[PATCH] eal/x86: improve rte_memcpy const size 16 performance

DPDK patches and discussions
 help / color / mirror / Atom feed

* [PATCH] eal/x86: improve rte_memcpy const size 16 performance
@ 2024-03-02 23:48 Morten Brørup
  2024-03-03  0:38 ` Morten Brørup
                   ` (12 more replies)
  0 siblings, 13 replies; 31+ messages in thread
From: Morten Brørup @ 2024-03-02 23:48 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/eal/x86/include/rte_memcpy.h | 224 ++++++++-----------------------
 1 file changed, 54 insertions(+), 170 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..6cc0e8ee16 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2024 SmartShare Systems
  */
 
 #ifndef _RTE_MEMCPY_X86_64_H_
@@ -91,14 +92,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +112,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || defined __AVX__
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+#define ALIGNMENT_MASK 0x3F
+
+/**
+ * AVX512 implementation below
+ */
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -321,73 +340,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * AVX2 implementation below
  */
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
-
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -437,15 +389,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -519,85 +470,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
  * SSE & AVX implementation below
  */
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
-
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
  * 47 bytes leftover maximum,
@@ -710,20 +582,26 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+#if defined __AVX__
+		rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
+#else /* SSE implementation */
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
+#endif
 		return ret;
 	}
 	if (n <= 128) {
@@ -828,8 +706,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
@ 2024-03-03  0:38 ` Morten Brørup
  2024-03-03  5:40 ` Stephen Hemminger
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-03-03  0:38 UTC (permalink / raw)
  To: dev

Recheck-request: iol-broadcom-Performance

Patch only modifies x86 code, but fails performance on aarch64.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
  2024-03-03  0:38 ` Morten Brørup
@ 2024-03-03  5:40 ` Stephen Hemminger
  2024-03-03  5:47   ` Stephen Hemminger
  2024-03-03  5:58   ` Stephen Hemminger
  2024-03-03  5:41 ` Stephen Hemminger
                   ` (10 subsequent siblings)
  12 siblings, 2 replies; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03  5:40 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

On Sun,  3 Mar 2024 00:48:12 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---

Looks good, let me see how it looks in godbolt vs GCC.

One other issue is that for the non-constant case, rte_memcpy has an excessively
large inline code footprint. That is one of the reasons Gcc doesn't always
inline.  For > 128 bytes, it really should be a function.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  5:40 ` Stephen Hemminger
@ 2024-03-03  5:47   ` Stephen Hemminger
  2024-03-03  5:58     ` Stephen Hemminger
  2024-03-03  5:58   ` Stephen Hemminger
  1 sibling, 1 reply; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03  5:47 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

While doing some tests with -Wall and -Wextra on the current code,
I saw that it doesn't really always get inlined anyway.

In file included from /usr/lib/gcc/x86_64-linux-gnu/13/include/immintrin.h:37,
                 from /usr/lib/gcc/x86_64-linux-gnu/13/include/x86intrin.h:32,
                 from ethcopy.c:4:
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h: In function ‘rte_memcpy_generic’:
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  553 |     case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  554 |     case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  555 |     case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:556:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  556 |     case 0x04: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x04); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:557:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  557 |     case 0x05: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x05); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:558:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  558 |     case 0x06: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x06); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  559 |     case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  560 |     case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  561 |     case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  562 |     case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |     case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:564:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  564 |     case 0x0C: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0C); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:565:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  565 |     case 0x0D: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0D); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:566:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  566 |     case 0x0E: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0E); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:505:9: note: called from here
  505 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:506:9: note: called from here
  506 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:507:9: note: called from here
  507 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:508:9: note: called from here
  508 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:509:9: note: called from here
  509 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:510:9: note: called from here
  510 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:511:9: note: called from here
  511 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset));        \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:526:13: note: called from here
  526 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:527:13: note: called from here
  527 |             _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset));    \
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  567 |     case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break;    \
      |                ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~

Compilation exited abnormally with code 1 at Sat Mar  2 21:43:50

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  5:47   ` Stephen Hemminger
@ 2024-03-03  5:58     ` Stephen Hemminger
  0 siblings, 0 replies; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03  5:58 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

On Sat, 2 Mar 2024 21:47:08 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> While doing some tests with -Wall and -Wextra with current code.
> Saw that it doesn't really always get inlined anyway.


Never mind; I needed to compile with -march=native.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  5:40 ` Stephen Hemminger
  2024-03-03  5:47   ` Stephen Hemminger
@ 2024-03-03  5:58   ` Stephen Hemminger
  2024-03-03 10:07     ` Morten Brørup
  1 sibling, 1 reply; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03  5:58 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

On Sat, 2 Mar 2024 21:40:03 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:

> On Sun,  3 Mar 2024 00:48:12 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > In the case where the size is knownto be 16 at build tine, omit the
> > duplicate copy.
> > 
> > Reduced the amount of effectively copy-pasted code by using #ifdef
> > inside functions instead of outside functions.
> > 
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---  
> 
> Looks good, let me see how it looks in goldbolt vs Gcc.
> 
> One other issue is that for the non-constant case, rte_memcpy has an excessively
> large inline code footprint. That is one of the reasons Gcc doesn't always
> inline.  For > 128 bytes, it really should be a function.

For sizes of 4, 6, 8, 16, 32, 64, up to 128, the Gcc inline code and rte_memcpy match.

For size 128, it looks like the gcc output is simpler.

rte_copy_addr:
        vmovdqu ymm0, YMMWORD PTR [rsi]
        vextracti128    XMMWORD PTR [rdi+16], ymm0, 0x1
        vmovdqu XMMWORD PTR [rdi], xmm0
        vmovdqu ymm0, YMMWORD PTR [rsi+32]
        vextracti128    XMMWORD PTR [rdi+48], ymm0, 0x1
        vmovdqu XMMWORD PTR [rdi+32], xmm0
        vmovdqu ymm0, YMMWORD PTR [rsi+64]
        vextracti128    XMMWORD PTR [rdi+80], ymm0, 0x1
        vmovdqu XMMWORD PTR [rdi+64], xmm0
        vmovdqu ymm0, YMMWORD PTR [rsi+96]
        vextracti128    XMMWORD PTR [rdi+112], ymm0, 0x1
        vmovdqu XMMWORD PTR [rdi+96], xmm0
        vzeroupper
        ret
copy_addr:
        vmovdqu ymm0, YMMWORD PTR [rsi]
        vmovdqu YMMWORD PTR [rdi], ymm0
        vmovdqu ymm1, YMMWORD PTR [rsi+32]
        vmovdqu YMMWORD PTR [rdi+32], ymm1
        vmovdqu ymm2, YMMWORD PTR [rsi+64]
        vmovdqu YMMWORD PTR [rdi+64], ymm2
        vmovdqu ymm3, YMMWORD PTR [rsi+96]
        vmovdqu YMMWORD PTR [rdi+96], ymm3
        vzeroupper
        ret

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  5:58   ` Stephen Hemminger
@ 2024-03-03 10:07     ` Morten Brørup
  0 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-03-03 10:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Sunday, 3 March 2024 06.58
> 
> On Sat, 2 Mar 2024 21:40:03 -0800
> Stephen Hemminger <stephen@networkplumber.org> wrote:
> 
> > On Sun,  3 Mar 2024 00:48:12 +0100
> > Morten Brørup <mb@smartsharesystems.com> wrote:
> >
> > > When the rte_memcpy() size is 16, the same 16 bytes are copied
> twice.
> > > In the case where the size is known to be 16 at build time, omit the
> > > duplicate copy.
> > >
> > > Reduced the amount of effectively copy-pasted code by using #ifdef
> > > inside functions instead of outside functions.
> > >
> > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> >
> > Looks good, let me see how it looks in Godbolt vs GCC.
> >
> > One other issue is that for the non-constant case, rte_memcpy has an
> excessively
> > large inline code footprint. That is one of the reasons Gcc doesn't
> always
> > inline.  For > 128 bytes, it really should be a function.

Yes, the code footprint is significant for the non-constant case.
I suppose Intel considered the cost and benefits when they developed this.
Or perhaps they just wanted a showcase for their new and shiny vector instructions. ;-)

Inlining might provide significant branch prediction benefits in cases where the size is not build-time constant, but run-time constant.

> 
> For sizes of 4, 6, 8, 16, 32, 64, up to 128, GCC inline and rte_memcpy match.
> 
> For size 128, it looks like GCC is simpler.
> 
> rte_copy_addr:
>         vmovdqu ymm0, YMMWORD PTR [rsi]
>         vextracti128    XMMWORD PTR [rdi+16], ymm0, 0x1
>         vmovdqu XMMWORD PTR [rdi], xmm0
>         vmovdqu ymm0, YMMWORD PTR [rsi+32]
>         vextracti128    XMMWORD PTR [rdi+48], ymm0, 0x1
>         vmovdqu XMMWORD PTR [rdi+32], xmm0
>         vmovdqu ymm0, YMMWORD PTR [rsi+64]
>         vextracti128    XMMWORD PTR [rdi+80], ymm0, 0x1
>         vmovdqu XMMWORD PTR [rdi+64], xmm0
>         vmovdqu ymm0, YMMWORD PTR [rsi+96]
>         vextracti128    XMMWORD PTR [rdi+112], ymm0, 0x1
>         vmovdqu XMMWORD PTR [rdi+96], xmm0
>         vzeroupper
>         ret

Interesting. Playing around with Godbolt revealed that GCC version < 11 creates the above from rte_memcpy, whereas GCC version >= 11 does it correctly. Clang doesn't have this issue.
I guess that's why the original code treated AVX as SSE.
Fixed in v2.

> copy_addr:
>         vmovdqu ymm0, YMMWORD PTR [rsi]
>         vmovdqu YMMWORD PTR [rdi], ymm0
>         vmovdqu ymm1, YMMWORD PTR [rsi+32]
>         vmovdqu YMMWORD PTR [rdi+32], ymm1
>         vmovdqu ymm2, YMMWORD PTR [rsi+64]
>         vmovdqu YMMWORD PTR [rdi+64], ymm2
>         vmovdqu ymm3, YMMWORD PTR [rsi+96]
>         vmovdqu YMMWORD PTR [rdi+96], ymm3
>         vzeroupper
>         ret

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
  2024-03-03  0:38 ` Morten Brørup
  2024-03-03  5:40 ` Stephen Hemminger
@ 2024-03-03  5:41 ` Stephen Hemminger
  2024-03-03  9:46 ` [PATCH v2] " Morten Brørup
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03  5:41 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

On Sun,  3 Mar 2024 00:48:12 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..6cc0e8ee16 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2010-2014 Intel Corporation
> + * Copyright(c) 2024 SmartShare Systems
>   */

Let's not start the precedent of adding individual copyrights on patches.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (2 preceding siblings ...)
  2024-03-03  5:41 ` Stephen Hemminger
@ 2024-03-03  9:46 ` Morten Brørup
  2024-04-04  9:18   ` Morten Brørup
  2024-04-04 10:07   ` Bruce Richardson
  2024-03-03 16:05 ` [PATCH] " Stephen Hemminger
                   ` (8 subsequent siblings)
  12 siblings, 2 replies; 31+ messages in thread
From: Morten Brørup @ 2024-03-03  9:46 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
 1 file changed, 56 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..d1df841f5e 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \
+		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +332,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
+#elif defined __AVX2__ || \
+		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
 
 /**
- * AVX2 implementation below
+ * AVX2 (and AVX, unless too old GCC version) implementation below
  */
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +390,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +465,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * SSE (and AVX, with too old GCC version) implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +585,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +699,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +714,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  9:46 ` [PATCH v2] " Morten Brørup
@ 2024-04-04  9:18   ` Morten Brørup
  2024-04-04 10:07   ` Bruce Richardson
  1 sibling, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-04-04  9:18 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev; +Cc: mattias.ronnblom, dev, stephen

PING Intel x86 maintainers for review.

> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Sunday, 3 March 2024 10.46
> 
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
>  1 file changed, 56 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..d1df841f5e 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
> 
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ ||
> \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
>  	__m256i ymm0;
> 
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
> 
>  /**
> @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
> 
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
> 
>  /**
> @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
> 
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> +	if (__builtin_constant_p(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> @@ -313,80 +332,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_128_BACK63;
>  }
> 
> -#elif defined __AVX2__
> -
> -#define ALIGNMENT_MASK 0x1F
> +#elif defined __AVX2__ || \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
> 
>  /**
> - * AVX2 implementation below
> + * AVX2 (and AVX, unless too old GCC version) implementation below
>   */
> 
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	__m256i ymm0;
> -
> -	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
> -	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
> -}
> -
> -/**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -}
> -
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
> -	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
> -	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
> -	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
> -}
> +#define ALIGNMENT_MASK 0x1F
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
> @@ -437,15 +390,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		return ret;
>  	}
> -	if (n <= 48) {
> +	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
> @@ -513,90 +465,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> 
>  #else /* __AVX512F__ */
> 
> -#define ALIGNMENT_MASK 0x0F
> -
> -/**
> - * SSE & AVX implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -}
> -
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> + * SSE (and AVX, with too old GCC version) implementation below
>   */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -}
> 
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> -}
> +#define ALIGNMENT_MASK 0x0F
> 
>  /**
>   * Macro for copying unaligned block from one location to another with
> constant load offset,
> @@ -712,17 +585,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
> -		return ret;
> -	}
> -	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
>  		return ret;
>  	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 +
> n);
>  		return ret;
>  	}
> @@ -828,8 +699,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 16 <= size <= 32 bytes */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
> @@ -837,6 +714,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 32 < size <= 64 bytes */
> +	if (__builtin_constant_p(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-03  9:46 ` [PATCH v2] " Morten Brørup
  2024-04-04  9:18   ` Morten Brørup
@ 2024-04-04 10:07   ` Bruce Richardson
  2024-04-04 11:19     ` Morten Brørup
  1 sibling, 1 reply; 31+ messages in thread
From: Bruce Richardson @ 2024-04-04 10:07 UTC (permalink / raw)
  To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev

On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>

Changes in general look good to me. Comments inline below.

/Bruce

> ---
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
>  1 file changed, 56 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..d1df841f5e 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
>  
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \
> +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))

I think we can drop the AVX512 checks here, since I'm not aware of any
system where we'd have AVX512 but not AVX2 available, so just checking for
AVX2 support should be sufficient.

On the final compiler-based check, I don't strongly object to it, but I
just wonder as to its real value. AVX2 was first introduced by Intel over 10
years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
~2015. While we did have CPUs still being produced without AVX2 since that
time, they generally didn't have AVX1 either, only having SSE instructions.
Therefore the number of systems which require this additional check is
likely very small at this stage.
That said, I'm ok to either keep or omit it at your choice. If you do keep
it, how about putting the check once at the top of the file and using a
single short define instead for the multiple places it's used e.g.

#if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)))
#define RTE_MEMCPY_AVX2
#endif


>  	__m256i ymm0;
>  
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
>  
>  /**
> @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
>  
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
>  
>  /**
> @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
>  
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__builtin_constant_p(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}

There's an outstanding patchset from Stephen to replace all use of
rte_memcpy with a constant parameter with an actual call to regular memcpy.
On a wider scale should we not look to do something similar in this file,
have calls to rte_memcpy with constant parameter always turn into a call to
regular memcpy? We used to have such a macro in older DPDK e.g.
from DPDK 1.8

http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?h=v1.8.0#n171

This would eliminate the need to put in constant_p checks all through the
code.

>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__builtin_constant_p(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
<snip>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 10:07   ` Bruce Richardson
@ 2024-04-04 11:19     ` Morten Brørup
  2024-04-04 13:29       ` Bruce Richardson
  0 siblings, 1 reply; 31+ messages in thread
From: Morten Brørup @ 2024-04-04 11:19 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Thursday, 4 April 2024 12.07
> 
> On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > In the case where the size is known to be 16 at build tine, omit the
> > duplicate copy.
> >
> > Reduced the amount of effectively copy-pasted code by using #ifdef
> > inside functions instead of outside functions.
> >
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> 
> Changes in general look good to me. Comments inline below.
> 
> /Bruce
> 
> > ---
> > v2:
> > * For GCC, version 11 is required for proper AVX handling;
> >   if older GCC version, treat AVX as SSE.
> >   Clang does not have this issue.
> >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > * Do not add copyright. (Stephen Hemminger)
> > ---
> >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> >  1 file changed, 56 insertions(+), 175 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> > index 72a92290e0..d1df841f5e 100644
> > --- a/lib/eal/x86/include/rte_memcpy.h
> > +++ b/lib/eal/x86/include/rte_memcpy.h
> > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
> >  	return ret;
> >  }
> >
> > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > -
> > -#define ALIGNMENT_MASK 0x3F
> > -
> > -/**
> > - * AVX512 implementation below
> > - */
> > -
> >  /**
> >   * Copy 16 bytes from one location to another,
> >   * locations should not overlap.
> > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov32(uint8_t *dst, const uint8_t *src)
> >  {
> > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__
> || \
> > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> < 110000)))
> 
> I think we can drop the AVX512 checks here, since I'm not aware of any
> system where we'd have AVX512 but not AVX2 available, so just checking for
> AVX2 support should be sufficient.

RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98

Without it, the AVX2 version will be used, regardless of whether the CPU has AVX512.

Also, there are some binutils bugs that might disable compilation for AVX512:
https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17

> 
> On the final compiler-based check, I don't strongly object to it, but I
> just wonder as to its real value. AVX2 was first introduced by Intel over 10
> years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
> ~2015. While we did have CPUs still being produced without AVX2 since that
> time, they generally didn't have AVX1 either, only having SSE instructions.
> Therefore the number of systems which require this additional check is
> likely very small at this stage.
> That said, I'm ok to either keep or omit it at your choice.

I kept it for consistency, and to support older compilers still officially supported by DPDK.

I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors.
Also, I have no clue what has been produced by Intel and AMD. :-)

> If you do keep
> it, how about putting the check once at the top of the file and using a
> single short define instead for the multiple places it's used e.g.
> 
> #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> 110000)))
> #define RTE_MEMCPY_AVX2
> #endif

Much of the code reorganization in this patch was done with the intention to improve readability.

And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined.

However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short.

> 
> 
> >  	__m256i ymm0;
> >
> >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > +#else /* SSE implementation */
> > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > +#endif
> >  }
> >
> >  /**
> > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov64(uint8_t *dst, const uint8_t *src)
> >  {
> > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> >  	__m512i zmm0;
> >
> >  	zmm0 = _mm512_loadu_si512((const void *)src);
> >  	_mm512_storeu_si512((void *)dst, zmm0);
> > +#else /* AVX2, AVX & SSE implementation */
> > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > +#endif
> >  }
> >
> >  /**
> > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> >  static __rte_always_inline void
> >  rte_mov256(uint8_t *dst, const uint8_t *src)
> >  {
> > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> >  }
> >
> > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > +
> > +/**
> > + * AVX512 implementation below
> > + */
> > +
> > +#define ALIGNMENT_MASK 0x3F
> > +
> >  /**
> >   * Copy 128-byte blocks from one location to another,
> >   * locations should not overlap.
> > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t
> n)
> >  	/**
> >  	 * Fast way when copy size doesn't exceed 512 bytes
> >  	 */
> > +	if (__builtin_constant_p(n) && n == 32) {
> > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > +		return ret;
> > +	}
> 
> There's an outstanding patchset from Stephen to replace all use of
> rte_memcpy with a constant parameter with an actual call to regular memcpy.
> On a wider scale should we not look to do something similar in this file,
> have calls to rte_memcpy with constant parameter always turn into a call to
> regular memcpy? We used to have such a macro in older DPDK e.g.
> from DPDK 1.8
> 
> http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> y.h?h=v1.8.0#n171
> 
> This would elminiate the need to put in constant_p checks all through the
> code.

The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies":
https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475

Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size.
I'm not convinced.
Such a change should be backed up by performance tests, preferably for all supported compilers - the old compilers that come with some of the supported distros, especially, might not be as good as we would hope.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 11:19     ` Morten Brørup
@ 2024-04-04 13:29       ` Bruce Richardson
  2024-04-04 15:37         ` Morten Brørup
  0 siblings, 1 reply; 31+ messages in thread
From: Bruce Richardson @ 2024-04-04 13:29 UTC (permalink / raw)
  To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev

On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > Sent: Thursday, 4 April 2024 12.07
> > 
> > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > > In the case where the size is known to be 16 at build tine, omit the
> > > duplicate copy.
> > >
> > > Reduced the amount of effectively copy-pasted code by using #ifdef
> > > inside functions instead of outside functions.
> > >
> > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > 
> > Changes in general look good to me. Comments inline below.
> > 
> > /Bruce
> > 
> > > ---
> > > v2:
> > > * For GCC, version 11 is required for proper AVX handling;
> > >   if older GCC version, treat AVX as SSE.
> > >   Clang does not have this issue.
> > >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > > * Do not add copyright. (Stephen Hemminger)
> > > ---
> > >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> > >  1 file changed, 56 insertions(+), 175 deletions(-)
> > >
> > > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > b/lib/eal/x86/include/rte_memcpy.h
> > > index 72a92290e0..d1df841f5e 100644
> > > --- a/lib/eal/x86/include/rte_memcpy.h
> > > +++ b/lib/eal/x86/include/rte_memcpy.h
> > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
> > >  	return ret;
> > >  }
> > >
> > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > -
> > > -#define ALIGNMENT_MASK 0x3F
> > > -
> > > -/**
> > > - * AVX512 implementation below
> > > - */
> > > -
> > >  /**
> > >   * Copy 16 bytes from one location to another,
> > >   * locations should not overlap.
> > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov32(uint8_t *dst, const uint8_t *src)
> > >  {
> > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__
> > || \
> > > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION
> > < 110000)))
> > 
> > I think we can drop the AVX512 checks here, since I'm not aware of any
> > system where we'd have AVX512 but not AVX2 available, so just checking for
> > AVX2 support should be sufficient.
> 
> RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
> https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98
> 
> Without it, the AVX2 version will be used, regardless if the CPU has AVX512.
> 
> Also, there are some binutils bugs that might disable compilation for AVX512:
> https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
> https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17
> 

Yes, I realise that, but the guard here is for an AVX2 block only, so there
is no point in checking for AVX512 - it's AVX512 or AVX2.

> > 
> > On the final compiler-based check, I don't strongly object to it, but I
> > just wonder as to its real value. AVX2 was first introduced by Intel over 10
> > years ago, and (from what I find in wikipedia), it's been in AMD CPUs since
> > ~2015. While we did have CPUs still being produced without AVX2 since that
> > time, they generally didn't have AVX1 either, only having SSE instructions.
> > Therefore the number of systems which require this additional check is
> > likely very small at this stage.
> > That said, I'm ok to either keep or omit it at your choice.
> 
> I kept it for consistency, and to support older compilers still officially supported by DPDK.
> 
> I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors.
> Also, I have no clue what has been produced by Intel and AMD. :-)
> 
> > If you do keep
> > it, how about putting the check once at the top of the file and using a
> > single short define instead for the multiple places it's used e.g.
> > 
> > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> > 110000)))
> > #define RTE_MEMCPY_AVX2
> > #endif
> 
> Much of the code reorganization in this patch was done with the intention to improve readability.
> 
> And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined.
> 
> However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short.
> 

I just find it long enough that duplication of it seems painful. :-) I'd
rather we check once at the top if we can use an AVX copy vs SSE, rather
than duplicate the compiler version checks multiple times.


> > 
> > 
> > >  	__m256i ymm0;
> > >
> > >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> > >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > > +#else /* SSE implementation */
> > > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> > > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> > > +#endif
> > >  }
> > >
> > >  /**
> > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov64(uint8_t *dst, const uint8_t *src)
> > >  {
> > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > >  	__m512i zmm0;
> > >
> > >  	zmm0 = _mm512_loadu_si512((const void *)src);
> > >  	_mm512_storeu_si512((void *)dst, zmm0);
> > > +#else /* AVX2, AVX & SSE implementation */
> > > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> > > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> > > +#endif
> > >  }
> > >
> > >  /**
> > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> > >  static __rte_always_inline void
> > >  rte_mov256(uint8_t *dst, const uint8_t *src)
> > >  {
> > > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> > >  }
> > >
> > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > +
> > > +/**
> > > + * AVX512 implementation below
> > > + */
> > > +
> > > +#define ALIGNMENT_MASK 0x3F
> > > +
> > >  /**
> > >   * Copy 128-byte blocks from one location to another,
> > >   * locations should not overlap.
> > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t
> > n)
> > >  	/**
> > >  	 * Fast way when copy size doesn't exceed 512 bytes
> > >  	 */
> > > +	if (__builtin_constant_p(n) && n == 32) {
> > > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > > +		return ret;
> > > +	}
> > 
> > There's an outstanding patchset from Stephen to replace all use of
> > rte_memcpy with a constant parameter with an actual call to regular memcpy.
> > On a wider scale should we not look to do something similar in this file,
> > have calls to rte_memcpy with constant parameter always turn into a call to
> > regular memcpy? We used to have such a macro in older DPDK e.g.
> > from DPDK 1.8
> > 
> > http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> > y.h?h=v1.8.0#n171
> > 
> > This would elminiate the need to put in constant_p checks all through the
> > code.
> 
> The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies":
> https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475
> 
> Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size.
> I'm not convinced.
> Such a change should be backed up by performance tests, preferably for all supported compilers - especially the old compilers that come with some of the supported distros might not be as good as we would hope.
>

I would tend to agree with Stephen that wherever possible we should use
the built-in memcpy calls. Hence my suggestion of re-introducing the macro.
I'm not sure why it previously was seen as slower, it may be that the
compiler-expanded memcpy calls are not done beyond a certain size.
However, since we lack data, I'm ok with taking the changes in your patch
as-is.

With the above-flagged superfluous AVX512 check on AVX2 code removed:

Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 13:29       ` Bruce Richardson
@ 2024-04-04 15:37         ` Morten Brørup
  2024-04-04 15:55           ` Stephen Hemminger
  0 siblings, 1 reply; 31+ messages in thread
From: Morten Brørup @ 2024-04-04 15:37 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev

> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Thursday, 4 April 2024 15.29
> 
> On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote:
> > > From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > Sent: Thursday, 4 April 2024 12.07
> > >
> > > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote:
> > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > > > In the case where the size is known to be 16 at build tine, omit the
> > > > duplicate copy.
> > > >
> > > > Reduced the amount of effectively copy-pasted code by using #ifdef
> > > > inside functions instead of outside functions.
> > > >
> > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > >
> > > Changes in general look good to me. Comments inline below.
> > >
> > > /Bruce
> > >
> > > > ---
> > > > v2:
> > > > * For GCC, version 11 is required for proper AVX handling;
> > > >   if older GCC version, treat AVX as SSE.
> > > >   Clang does not have this issue.
> > > >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > > > * Do not add copyright. (Stephen Hemminger)
> > > > ---
> > > >  lib/eal/x86/include/rte_memcpy.h | 231 ++++++++-----------------------
> > > >  1 file changed, 56 insertions(+), 175 deletions(-)
> > > >
> > > > diff --git a/lib/eal/x86/include/rte_memcpy.h
> > > b/lib/eal/x86/include/rte_memcpy.h
> > > > index 72a92290e0..d1df841f5e 100644
> > > > --- a/lib/eal/x86/include/rte_memcpy.h
> > > > +++ b/lib/eal/x86/include/rte_memcpy.h
> > > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t
> n)
> > > >  	return ret;
> > > >  }
> > > >
> > > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > > -
> > > > -#define ALIGNMENT_MASK 0x3F
> > > > -
> > > > -/**
> > > > - * AVX512 implementation below
> > > > - */
> > > > -
> > > >  /**
> > > >   * Copy 16 bytes from one location to another,
> > > >   * locations should not overlap.
> > > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov32(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined
> __AVX2__
> > > || \
> > > > +		(defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) &&
> (GCC_VERSION
> > > < 110000)))
> > >
> > > I think we can drop the AVX512 checks here, since I'm not aware of any
> > > system where we'd have AVX512 but not AVX2 available, so just checking for
> > > AVX2 support should be sufficient.
> >
> > RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512:
> >
> https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memc
> py.h#L98
> >
> > Without it, the AVX2 version will be used, regardless if the CPU has AVX512.
> >
> > Also, there are some binutils bugs that might disable compilation for
> AVX512:
> > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4
> > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17
> >
> 
> Yes, I realise that, but the guard here is for an AVX2 block only, so there
> is no point in checking for AVX512 - it's AVX512 or AVX2.

Aha! Now I get your point:
Checking for AVX2 suffices for AVX2 code.

I didn't think of that when combining the copy-pasted code into one code block.

Well spotted! Thank you.

> 
> > >
> > > On the final compiler-based check, I don't strongly object to it, but I
> > > just wonder as to its real value. AVX2 was first introduced by Intel over
> 10
> > > years ago, and (from what I find in wikipedia), it's been in AMD CPUs
> since
> > > ~2015. While we did have CPUs still being produced without AVX2 since that
> > > time, they generally didn't have AVX1 either, only having SSE
> instructions.
> > > Therefore the number of systems which require this additional check is
> > > likely very small at this stage.
> > > That said, I'm ok to either keep or omit it at your choice.
> >
> > I kept it for consistency, and to support older compilers still officially
> supported by DPDK.
> >
> > I don't feel qualified to change support for CPU features; I'll leave that
> to the CPU vendors.
> > Also, I have no clue what has been produced by Intel and AMD. :-)
> >
> > > If you do keep
> > > it, how about putting the check once at the top of the file and using a
> > > single short define instead for the multiple places it's used e.g.
> > >
> > > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> > > 110000)))
> > > #define RTE_MEMCPY_AVX2
> > > #endif
> >
> > Much of the code reorganization in this patch was done with the intention to
> improve readability.
> >
> > And I don't think this suggestion improves readability; especially
> considering that RTE_MEMCPY_AVX512 is something manually defined.
> >
> > However, I get your point; and if the conditional was very long or very
> complex, I might agree to a "shadow" definition to keep it short.
> >
> 
> I just find it long enough that duplication of it seems painful. :-) I'd
> rather we check once at the top if we can use an AVX copy vs SSE, rather
> than duplicate the compiler version checks multiple times.

OK. And I suppose the same principle as above applies:
AVX2 implies AVX, so checking for AVX suffices.

I suppose your suggested name RTE_MEMCPY_AVX2 was a typo, so I will define it as RTE_MEMCPY_AVX.

> 
> 
> > >
> > >
> > > >  	__m256i ymm0;
> > > >
> > > >  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
> > > >  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> > > > +#else /* SSE implementation */
> > > > +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 *
> 16);
> > > > +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 *
> 16);
> > > > +#endif
> > > >  }
> > > >
> > > >  /**
> > > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov64(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > >  	__m512i zmm0;
> > > >
> > > >  	zmm0 = _mm512_loadu_si512((const void *)src);
> > > >  	_mm512_storeu_si512((void *)dst, zmm0);
> > > > +#else /* AVX2, AVX & SSE implementation */
> > > > +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 *
> 32);
> > > > +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 *
> 32);
> > > > +#endif
> > > >  }
> > > >
> > > >  /**
> > > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
> > > >  static __rte_always_inline void
> > > >  rte_mov256(uint8_t *dst, const uint8_t *src)
> > > >  {
> > > > -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> > > > -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> > > > -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> > > > -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> > > > +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> > > > +	rte_mov128(dst + 1 * 128, src + 1 * 128);
> > > >  }
> > > >
> > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> > > > +
> > > > +/**
> > > > + * AVX512 implementation below
> > > > + */
> > > > +
> > > > +#define ALIGNMENT_MASK 0x3F
> > > > +
> > > >  /**
> > > >   * Copy 128-byte blocks from one location to another,
> > > >   * locations should not overlap.
> > > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src,
> size_t
> > > n)
> > > >  	/**
> > > >  	 * Fast way when copy size doesn't exceed 512 bytes
> > > >  	 */
> > > > +	if (__builtin_constant_p(n) && n == 32) {
> > > > +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> > > > +		return ret;
> > > > +	}
> > >
> > > There's an outstanding patchset from Stephen to replace all use of
> > > rte_memcpy with a constant parameter with an actual call to regular
> memcpy.
> > > On a wider scale should we not look to do something similar in this file,
> > > have calls to rte_memcpy with constant parameter always turn into a call
> to
> > > regular memcpy? We used to have such a macro in older DPDK e.g.
> > > from DPDK 1.8
> > >
> > >
> http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp
> > > y.h?h=v1.8.0#n171
> > >
> > > This would elminiate the need to put in constant_p checks all through the
> > > code.
> >
> > The old macro in DPDK 1.8 was removed with the description "Remove slow
> glibc call for constant copies":
> >
> https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_me
> mcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475
> >
> > Stephen believes that the memcpy() built-ins provided by compilers are
> faster than rte_memcpy() for constant size.
> > I'm not convinced.
> > Such a change should be backed up by performance tests, preferably for all
> supported compilers - especially the old compilers that come with some of the
> supported distros might not be as good as we would hope.
> >
> 
> I would tend to agree with Stephen that whereever possible we should use
> the built-in memcpy calls. Hence my suggestion of re-introducing the macro.

I agree in principle, but strongly prefer data to back up such changes in the fast path.

> I'm not sure why it previously was seen as slower, it may be that the
> compiler-expanded memcpy calls are not done beyond a certain size.
> However, since we lack data, I'm ok with taking the changes in your patch
> as-is.
> 
> With the above-flagged superfluous AVX512 check on AVX2 code removed:
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Thanks. I'll provide a v3 patch.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 15:37         ` Morten Brørup
@ 2024-04-04 15:55           ` Stephen Hemminger
  2024-04-04 16:10             ` Morten Brørup
  0 siblings, 1 reply; 31+ messages in thread
From: Stephen Hemminger @ 2024-04-04 15:55 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Bruce Richardson, konstantin.v.ananyev, mattias.ronnblom, dev

On Thu, 4 Apr 2024 17:37:53 +0200
Morten Brørup <mb@smartsharesystems.com> wrote:

> > I would tend to agree with Stephen that wherever possible we should use
> > the built-in memcpy calls. Hence my suggestion of re-introducing the macro.  
> 
> I agree in principle, but strongly prefer data to back up such changes in the fast path.


godbolt.org shows same instructions for the cases I looked at.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 15:55           ` Stephen Hemminger
@ 2024-04-04 16:10             ` Morten Brørup
  2024-04-04 16:55               ` Bruce Richardson
  0 siblings, 1 reply; 31+ messages in thread
From: Morten Brørup @ 2024-04-04 16:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Bruce Richardson, konstantin.v.ananyev, mattias.ronnblom, dev

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Thursday, 4 April 2024 17.56
> 
> On Thu, 4 Apr 2024 17:37:53 +0200
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > I would tend to agree with Stephen that wherever possible we should use
> > > the built-in memcpy calls. Hence my suggestion of re-introducing the
> macro.
> >
> > I agree in principle, but strongly prefer data to back up such changes in
> the fast path.
> 
> 
> godbolt.org shows same instructions for the cases I looked at.

Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested.
On behalf of the distros, I'm mostly worried about older compilers.

Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-04 16:10             ` Morten Brørup
@ 2024-04-04 16:55               ` Bruce Richardson
  0 siblings, 0 replies; 31+ messages in thread
From: Bruce Richardson @ 2024-04-04 16:55 UTC (permalink / raw)
  To: Morten Brørup
  Cc: Stephen Hemminger, konstantin.v.ananyev, mattias.ronnblom, dev

On Thu, Apr 04, 2024 at 06:10:32PM +0200, Morten Brørup wrote:
> > From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> > Sent: Thursday, 4 April 2024 17.56
> > 
> > On Thu, 4 Apr 2024 17:37:53 +0200
> > Morten Brørup <mb@smartsharesystems.com> wrote:
> > 
> > > > > I would tend to agree with Stephen that wherever possible we should use
> > > > the built-in memcpy calls. Hence my suggestion of re-introducing the
> > macro.
> > >
> > > I agree in principle, but strongly prefer data to back up such changes in
> > the fast path.
> > 
> > 
> > godbolt.org shows same instructions for the cases I looked at.
> 
> Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested.
> On behalf of the distros, I'm mostly worried about older compilers.
> 
> Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first.
> 
I agree. Best to keep such changes in separate patches.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (3 preceding siblings ...)
  2024-03-03  9:46 ` [PATCH v2] " Morten Brørup
@ 2024-03-03 16:05 ` Stephen Hemminger
  2024-04-05 12:46 ` [PATCH v3] " Morten Brørup
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Stephen Hemminger @ 2024-03-03 16:05 UTC (permalink / raw)
  To: Morten Brørup
  Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev

Another option would be to just do what PPC already does.
The ENA part is because it has some garbage trying to use memcpy
always (which is one of those bad ideas).

From 74e7ab929e61e0481f6e0214d4d06a716b2f7d79 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 3 Mar 2024 08:02:07 -0800
Subject: [PATCH] rte_memcpy: use builtin memcpy for fixed sizes

This makes x86 arch do same thing as PPC, and also allows
code checkers to see memcpy issues.  It shows a pre-existing
bug in ipsec test now.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/ena/base/ena_plat_dpdk.h |  9 +-----
 lib/eal/x86/include/rte_memcpy.h     | 45 +++++++++++++++-------------
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h
index 14bf582a451f..997e6aa3dfbd 100644
--- a/drivers/net/ena/base/ena_plat_dpdk.h
+++ b/drivers/net/ena/base/ena_plat_dpdk.h
@@ -70,14 +70,7 @@ typedef uint64_t dma_addr_t;
 #define ENA_UDELAY(x) rte_delay_us_block(x)
 
 #define ENA_TOUCH(x) ((void)(x))
-/* Redefine memcpy with caution: rte_memcpy can be simply aliased to memcpy, so
- * make the redefinition only if it's safe (and beneficial) to do so.
- */
-#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64_MEMCPY) || \
-	defined(RTE_ARCH_ARM_NEON_MEMCPY)
-#undef memcpy
-#define memcpy rte_memcpy
-#endif
+
 #define wmb rte_wmb
 #define rmb rte_rmb
 #define mb rte_mb
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e05d..aab30be0eeb9 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,24 +27,6 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
-/**
- * Copy bytes from one location to another. The locations must not overlap.
- *
- * @note This is implemented as a macro, so it's address should not be taken
- * and care is needed as parameter expressions may be evaluated multiple times.
- *
- * @param dst
- *   Pointer to the destination of the data.
- * @param src
- *   Pointer to the source data.
- * @param n
- *   Number of bytes to copy.
- * @return
- *   Pointer to the destination data.
- */
-static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n);
-
 /**
  * Copy bytes from one location to another,
  * locations should not overlap.
@@ -859,8 +841,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-static __rte_always_inline void *
-rte_memcpy(void *dst, const void *src, size_t n)
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
 {
 	if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK))
 		return rte_memcpy_aligned(dst, src, n);
@@ -868,6 +850,29 @@ rte_memcpy(void *dst, const void *src, size_t n)
 		return rte_memcpy_generic(dst, src, n);
 }
 
+
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+#define rte_memcpy(dst, src, n)              \
+	__extension__ ({                     \
+	(__builtin_constant_p(n)) ?          \
+	memcpy((dst), (src), (n)) :          \
+	rte_memcpy_func((dst), (src), (n)); })
+
+
 #undef ALIGNMENT_MASK
 
 #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000)
-- 
2.43.0




^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v3] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (4 preceding siblings ...)
  2024-03-03 16:05 ` [PATCH] " Stephen Hemminger
@ 2024-04-05 12:46 ` Morten Brørup
  2024-04-05 13:17   ` Bruce Richardson
  2024-04-05 13:48 ` [PATCH v4] " Morten Brørup
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Morten Brørup @ 2024-04-05 12:46 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 234 ++++++++-----------------------
 1 file changed, 59 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..b56bc46713 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,11 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/* GCC prior to version 11 doesn't compile AVX properly, so use SSE instead. */
+#if defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +96,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +116,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +134,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +163,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +244,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +336,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +393,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +468,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
 /**
- * SSE & AVX implementation below
+ * SSE implementation below
  */
 
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
-/**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +588,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +702,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +717,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3] eal/x86: improve rte_memcpy const size 16 performance
  2024-04-05 12:46 ` [PATCH v3] " Morten Brørup
@ 2024-04-05 13:17   ` Bruce Richardson
  0 siblings, 0 replies; 31+ messages in thread
From: Bruce Richardson @ 2024-04-05 13:17 UTC (permalink / raw)
  To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev

On Fri, Apr 05, 2024 at 02:46:28PM +0200, Morten Brørup wrote:
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> v3:
> * AVX2 is a superset of AVX;
>   for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
> * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
>   check for older GCC version. (Bruce Richardson)
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 234 ++++++++-----------------------
>  1 file changed, 59 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..b56bc46713 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -27,6 +27,11 @@ extern "C" {
>  #pragma GCC diagnostic ignored "-Wstringop-overflow"
>  #endif
>  
> +/* GCC prior to version 11 doesn't compile AVX properly, so use SSE instead. */
> +#if defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
> +#define RTE_MEMCPY_AVX
> +#endif
> +

Strictly speaking, to have the same behaviour as before, you need to check
for AVX2 also, since the issue with GCC < 11 is for (AVX && !AVX2), i.e. if
AVX2 is supported, all compilers are fine.

My suggestion:
#ifdef __AVX2__
#define RTE_MEMCPY_AVX
#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
#define RTE_MEMCPY_AVX
#endif

You can obviously merge the two branches if you want, but I find the split
slightly easier to follow, than a mix of && and || with brackets for
precedence.

Final alternative I see, you can change defined(RTE_MEMCPY_AVX) to 
"defined(__AVX2__) || defined(RTE_MEMCPY_AVX)" each place it's used.

/Bruce


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v4] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (5 preceding siblings ...)
  2024-04-05 12:46 ` [PATCH v3] " Morten Brørup
@ 2024-04-05 13:48 ` Morten Brørup
  2024-05-27 13:15 ` Morten Brørup
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-04-05 13:48 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..d687aa7756 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v4] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (6 preceding siblings ...)
  2024-04-05 13:48 ` [PATCH v4] " Morten Brørup
@ 2024-05-27 13:15 ` Morten Brørup
  2024-05-27 13:16 ` [PATCH v5] " Morten Brørup
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-27 13:15 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..d687aa7756 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__builtin_constant_p(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__builtin_constant_p(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__builtin_constant_p(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (7 preceding siblings ...)
  2024-05-27 13:15 ` Morten Brørup
@ 2024-05-27 13:16 ` Morten Brørup
  2024-05-27 14:13   ` Morten Brørup
  2024-05-28  6:18 ` Morten Brørup
                   ` (3 subsequent siblings)
  12 siblings, 1 reply; 31+ messages in thread
From: Morten Brørup @ 2024-05-27 13:16 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Depends-on: series-31578 ("provide toolchain abstracted
__builtin_constant_p")

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v5:
* Fix for building with MSVC:
  Use __rte_constant() instead of __builtin_constant_p().
  Add dependency on patch providing __rte_constant().
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..1619a8f296 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance
  2024-05-27 13:16 ` [PATCH v5] " Morten Brørup
@ 2024-05-27 14:13   ` Morten Brørup
  0 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-27 14:13 UTC (permalink / raw)
  To: dev

Recheck-request: iol-testing


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (8 preceding siblings ...)
  2024-05-27 13:16 ` [PATCH v5] " Morten Brørup
@ 2024-05-28  6:18 ` Morten Brørup
  2024-05-28  6:22 ` [PATCH v6] " Morten Brørup
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-28  6:18 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v6:
* Don't wrap the Depends-on line; the CI seems not to have understood it when wrapped.
v5:
* Fix for building with MSVC:
  Use __rte_constant() instead of __builtin_constant_p().
  Add dependency on patch providing __rte_constant().
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..1619a8f296 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v6] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (9 preceding siblings ...)
  2024-05-28  6:18 ` Morten Brørup
@ 2024-05-28  6:22 ` Morten Brørup
  2024-05-28  7:05 ` [PATCH v7] " Morten Brørup
  2024-05-30 15:41 ` [PATCH v8] " Morten Brørup
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-28  6:22 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Depends-on: series-31578 ("provide toolchain abstracted ...")

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v6:
* Don't wrap the Depends-on line; the CI seems not to have understood it when wrapped.
v5:
* Fix for building with MSVC:
  Use __rte_constant() instead of __builtin_constant_p().
  Add dependency on patch providing __rte_constant().
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..1619a8f296 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v7] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (10 preceding siblings ...)
  2024-05-28  6:22 ` [PATCH v6] " Morten Brørup
@ 2024-05-28  7:05 ` Morten Brørup
  2024-05-30 15:41 ` [PATCH v8] " Morten Brørup
  12 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-28  7:05 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
Depends-on: patch-138647 ("eal: provide macro for GCC builtin constant intrinsic")

v7:
* Keep trying to fix that CI does not understand the dependency...
  Depend on patch instead of series.
  Move dependency out of the patch description itself, and down to the
  version log.
v6:
* Trying to fix CI not understanding dependency...
  Don't wrap dependency line.
v5:
* Fix for building with MSVC:
  Use __rte_constant() instead of __builtin_constant_p().
  Add dependency on patch providing __rte_constant().
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..1619a8f296 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance
  2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
                   ` (11 preceding siblings ...)
  2024-05-28  7:05 ` [PATCH v7] " Morten Brørup
@ 2024-05-30 15:41 ` Morten Brørup
  2024-06-10  9:05   ` Morten Brørup
  2024-06-10 13:40   ` Konstantin Ananyev
  12 siblings, 2 replies; 31+ messages in thread
From: Morten Brørup @ 2024-05-30 15:41 UTC (permalink / raw)
  To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla
  Cc: mattias.ronnblom, aconole, dev, Morten Brørup

When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
In the case where the size is known to be 16 at build time, omit the
duplicate copy.

Reduced the amount of effectively copy-pasted code by using #ifdef
inside functions instead of outside functions.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")

v8:
* Keep trying to fix that CI does not understand the dependency...
  Depend on series instead of patch. Github only understands series.
* Fix typo in patch description.
v7:
* Keep trying to fix that CI does not understand the dependency...
  Depend on patch instead of series.
  Move dependency out of the patch description itself, and down to the
  version log.
v6:
* Trying to fix CI not understanding dependency...
  Don't wrap dependency line.
v5:
* Fix for building with MSVC:
  Use __rte_constant() instead of __builtin_constant_p().
  Add dependency on patch providing __rte_constant().
v4:
* There are no problems compiling AVX2, only AVX. (Bruce Richardson)
v3:
* AVX2 is a superset of AVX;
  for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
* Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
  check for older GCC version. (Bruce Richardson)
v2:
* For GCC, version 11 is required for proper AVX handling;
  if older GCC version, treat AVX as SSE.
  Clang does not have this issue.
  Note: Original code always treated AVX as SSE, regardless of compiler.
* Do not add copyright. (Stephen Hemminger)
---
 lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
 1 file changed, 64 insertions(+), 175 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..1619a8f296 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -27,6 +27,16 @@ extern "C" {
 #pragma GCC diagnostic ignored "-Wstringop-overflow"
 #endif
 
+/*
+ * GCC older than version 11 doesn't compile AVX properly, so use SSE instead.
+ * There are no problems with AVX2.
+ */
+#if defined __AVX2__
+#define RTE_MEMCPY_AVX
+#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))
+#define RTE_MEMCPY_AVX
+#endif
+
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
 	return ret;
 }
 
-#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
-
-#define ALIGNMENT_MASK 0x3F
-
-/**
- * AVX512 implementation below
- */
-
 /**
  * Copy 16 bytes from one location to another,
  * locations should not overlap.
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
+#if defined RTE_MEMCPY_AVX
 	__m256i ymm0;
 
 	ymm0 = _mm256_loadu_si256((const __m256i *)src);
 	_mm256_storeu_si256((__m256i *)dst, ymm0);
+#else /* SSE implementation */
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+#endif
 }
 
 /**
@@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 	__m512i zmm0;
 
 	zmm0 = _mm512_loadu_si512((const void *)src);
 	_mm512_storeu_si512((void *)dst, zmm0);
+#else /* AVX2, AVX & SSE implementation */
+	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
+	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+#endif
 }
 
 /**
@@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
 static __rte_always_inline void
 rte_mov256(uint8_t *dst, const uint8_t *src)
 {
-	rte_mov64(dst + 0 * 64, src + 0 * 64);
-	rte_mov64(dst + 1 * 64, src + 1 * 64);
-	rte_mov64(dst + 2 * 64, src + 2 * 64);
-	rte_mov64(dst + 3 * 64, src + 3 * 64);
+	rte_mov128(dst + 0 * 128, src + 0 * 128);
+	rte_mov128(dst + 1 * 128, src + 1 * 128);
 }
 
+#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
+
+/**
+ * AVX512 implementation below
+ */
+
+#define ALIGNMENT_MASK 0x3F
+
 /**
  * Copy 128-byte blocks from one location to another,
  * locations should not overlap.
@@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 512 bytes
 	 */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				  (const uint8_t *)src - 16 + n);
 		return ret;
 	}
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
@@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	goto COPY_BLOCK_128_BACK63;
 }
 
-#elif defined __AVX2__
-
-#define ALIGNMENT_MASK 0x1F
-
-/**
- * AVX2 implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	__m256i ymm0;
-
-	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
-	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
-}
+#elif defined RTE_MEMCPY_AVX
 
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-}
-
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
+ * AVX implementation below
  */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-}
 
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
-	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
-	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
-	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
-	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
-	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
-	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
-	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
-}
+#define ALIGNMENT_MASK 0x1F
 
 /**
  * Copy 128-byte blocks from one location to another,
@@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	/**
 	 * Fast way when copy size doesn't exceed 256 bytes
 	 */
-	if (n <= 32) {
-		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n,
-				(const uint8_t *)src - 16 + n);
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		return ret;
 	}
-	if (n <= 48) {
+	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 		return ret;
@@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 
 #else /* __AVX512F__ */
 
-#define ALIGNMENT_MASK 0x0F
-
-/**
- * SSE & AVX implementation below
- */
-
-/**
- * Copy 16 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm0;
-
-	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
-	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
-}
-
-/**
- * Copy 32 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-}
-
 /**
- * Copy 64 bytes from one location to another,
- * locations should not overlap.
+ * SSE implementation below
  */
-static __rte_always_inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-}
 
-/**
- * Copy 128 bytes from one location to another,
- * locations should not overlap.
- */
-static __rte_always_inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-}
-
-/**
- * Copy 256 bytes from one location to another,
- * locations should not overlap.
- */
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
-	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
-	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
-	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
-	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
-	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
-	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
-	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
-	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
-	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
-	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
-	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
-	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
-	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
-	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
-	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
-}
+#define ALIGNMENT_MASK 0x0F
 
 /**
  * Macro for copying unaligned block from one location to another with constant load offset,
@@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 */
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
-		return ret;
-	}
-	if (n <= 48) {
-		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
-		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
+		if (n > 48)
+			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
 		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
 		return ret;
 	}
@@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 16 <= size <= 32 bytes */
+	if (__rte_constant(n) && n == 32) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 32) {
 		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		if (__rte_constant(n) && n == 16)
+			return ret; /* avoid (harmless) duplicate copy */
 		rte_mov16((uint8_t *)dst - 16 + n,
 				(const uint8_t *)src - 16 + n);
 
@@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
 	}
 
 	/* Copy 32 < size <= 64 bytes */
+	if (__rte_constant(n) && n == 64) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		return ret;
+	}
 	if (n <= 64) {
 		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
 		rte_mov32((uint8_t *)dst - 32 + n,
-- 
2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance
  2024-05-30 15:41 ` [PATCH v8] " Morten Brørup
@ 2024-06-10  9:05   ` Morten Brørup
  2024-06-10 13:40   ` Konstantin Ananyev
  1 sibling, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-06-10  9:05 UTC (permalink / raw)
  To: konstantin.v.ananyev, stephen
  Cc: mattias.ronnblom, roretzla, dev, bruce.richardson

PING for review.

The CI failures can be ignored: most of the CI doesn't support the Depends-on tag, so it builds this patch without the series it depends on. This patch uses __rte_constant(), which is provided by Tyler's patch series [1].

[1]: https://inbox.dpdk.org/dev/1710970416-27841-1-git-send-email-roretzla@linux.microsoft.com/

-Morten

> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Thursday, 30 May 2024 17.41
> 
> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")
> 
> v8:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on series instead of patch. Github only understands series.
> * Fix typo in patch description.
> v7:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on patch instead of series.
>   Move dependency out of the patch description itself, and down to the
>   version log.
> v6:
> * Trying to fix CI not understanding dependency...
>   Don't wrap dependency line.
> v5:
> * Fix for building with MSVC:
>   Use __rte_constant() instead of __builtin_constant_p().
>   Add dependency on patch providing __rte_constant().
> v4:
> * There are no problems compiling AVX2, only AVX. (Bruce Richardson)
> v3:
> * AVX2 is a superset of AVX;
>   for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
> * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
>   check for older GCC version. (Bruce Richardson)
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)
> ---
>  lib/eal/x86/include/rte_memcpy.h | 239 +++++++++----------------------
>  1 file changed, 64 insertions(+), 175 deletions(-)
> 
> diff --git a/lib/eal/x86/include/rte_memcpy.h
> b/lib/eal/x86/include/rte_memcpy.h
> index 72a92290e0..1619a8f296 100644
> --- a/lib/eal/x86/include/rte_memcpy.h
> +++ b/lib/eal/x86/include/rte_memcpy.h
> @@ -27,6 +27,16 @@ extern "C" {
>  #pragma GCC diagnostic ignored "-Wstringop-overflow"
>  #endif
> 
> +/*
> + * GCC older than version 11 doesn't compile AVX properly, so use SSE
> instead.
> + * There are no problems with AVX2.
> + */
> +#if defined __AVX2__
> +#define RTE_MEMCPY_AVX
> +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION <
> 110000))
> +#define RTE_MEMCPY_AVX
> +#endif
> +
>  /**
>   * Copy bytes from one location to another. The locations must not overlap.
>   *
> @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n)
>  	return ret;
>  }
> 
> -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> -
> -#define ALIGNMENT_MASK 0x3F
> -
> -/**
> - * AVX512 implementation below
> - */
> -
>  /**
>   * Copy 16 bytes from one location to another,
>   * locations should not overlap.
> @@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined RTE_MEMCPY_AVX
>  	__m256i ymm0;
> 
>  	ymm0 = _mm256_loadu_si256((const __m256i *)src);
>  	_mm256_storeu_si256((__m256i *)dst, ymm0);
> +#else /* SSE implementation */
> +	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> +	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +#endif
>  }
> 
>  /**
> @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
>  	__m512i zmm0;
> 
>  	zmm0 = _mm512_loadu_si512((const void *)src);
>  	_mm512_storeu_si512((void *)dst, zmm0);
> +#else /* AVX2, AVX & SSE implementation */
> +	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> +	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +#endif
>  }
> 
>  /**
> @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static __rte_always_inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -	rte_mov64(dst + 0 * 64, src + 0 * 64);
> -	rte_mov64(dst + 1 * 64, src + 1 * 64);
> -	rte_mov64(dst + 2 * 64, src + 2 * 64);
> -	rte_mov64(dst + 3 * 64, src + 3 * 64);
> +	rte_mov128(dst + 0 * 128, src + 0 * 128);
> +	rte_mov128(dst + 1 * 128, src + 1 * 128);
>  }
> 
> +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
> +
> +/**
> + * AVX512 implementation below
> + */
> +
> +#define ALIGNMENT_MASK 0x3F
> +
>  /**
>   * Copy 128-byte blocks from one location to another,
>   * locations should not overlap.
> @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 512 bytes
>  	 */
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				  (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> +	if (__rte_constant(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	goto COPY_BLOCK_128_BACK63;
>  }
> 
> -#elif defined __AVX2__
> -
> -#define ALIGNMENT_MASK 0x1F
> -
> -/**
> - * AVX2 implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	__m256i ymm0;
> -
> -	ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src);
> -	_mm256_storeu_si256((__m256i *)(void *)dst, ymm0);
> -}
> +#elif defined RTE_MEMCPY_AVX
> 
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -}
> -
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> + * AVX implementation below
>   */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -}
> 
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -	rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
> -	rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
> -	rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
> -	rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
> -}
> +#define ALIGNMENT_MASK 0x1F
> 
>  /**
>   * Copy 128-byte blocks from one location to another,
> @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	/**
>  	 * Fast way when copy size doesn't exceed 256 bytes
>  	 */
> -	if (n <= 32) {
> -		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n,
> -				(const uint8_t *)src - 16 + n);
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		return ret;
>  	}
> -	if (n <= 48) {
> +	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
>  		return ret;
> @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
> 
>  #else /* __AVX512F__ */
> 
> -#define ALIGNMENT_MASK 0x0F
> -
> -/**
> - * SSE & AVX implementation below
> - */
> -
> -/**
> - * Copy 16 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov16(uint8_t *dst, const uint8_t *src)
> -{
> -	__m128i xmm0;
> -
> -	xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
> -	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
> -}
> -
> -/**
> - * Copy 32 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov32(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -}
> -
>  /**
> - * Copy 64 bytes from one location to another,
> - * locations should not overlap.
> + * SSE implementation below
>   */
> -static __rte_always_inline void
> -rte_mov64(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -}
> 
> -/**
> - * Copy 128 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static __rte_always_inline void
> -rte_mov128(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -}
> -
> -/**
> - * Copy 256 bytes from one location to another,
> - * locations should not overlap.
> - */
> -static inline void
> -rte_mov256(uint8_t *dst, const uint8_t *src)
> -{
> -	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -	rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -	rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -	rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -	rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -	rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -	rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -	rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -	rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> -}
> +#define ALIGNMENT_MASK 0x0F
> 
>  /**
>   * Macro for copying unaligned block from one location to another with constant load offset,
> @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
>  	 */
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> -		return ret;
> -	}
> -	if (n <= 48) {
> -		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -		rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> +		if (n > 48)
> +			rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
>  		rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
>  		return ret;
>  	}
> @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 16 <= size <= 32 bytes */
> +	if (__rte_constant(n) && n == 32) {
> +		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 32) {
>  		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> +		if (__rte_constant(n) && n == 16)
> +			return ret; /* avoid (harmless) duplicate copy */
>  		rte_mov16((uint8_t *)dst - 16 + n,
>  				(const uint8_t *)src - 16 + n);
> 
> @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n)
>  	}
> 
>  	/* Copy 32 < size <= 64 bytes */
> +	if (__rte_constant(n) && n == 64) {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		return ret;
> +	}
>  	if (n <= 64) {
>  		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
>  		rte_mov32((uint8_t *)dst - 32 + n,
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance
  2024-05-30 15:41 ` [PATCH v8] " Morten Brørup
  2024-06-10  9:05   ` Morten Brørup
@ 2024-06-10 13:40   ` Konstantin Ananyev
  2024-06-10 13:59     ` Morten Brørup
  1 sibling, 1 reply; 31+ messages in thread
From: Konstantin Ananyev @ 2024-06-10 13:40 UTC (permalink / raw)
  To: Morten Brørup, bruce.richardson, konstantin.v.ananyev,
	stephen, roretzla
  Cc: mattias.ronnblom, aconole, dev



> When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> In the case where the size is known to be 16 at build time, omit the
> duplicate copy.
> 
> Reduced the amount of effectively copy-pasted code by using #ifdef
> inside functions instead of outside functions.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")
> 
> v8:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on series instead of patch. Github only understands series.
> * Fix typo in patch description.
> v7:
> * Keep trying to fix that CI does not understand the dependency...
>   Depend on patch instead of series.
>   Move dependency out of the patch description itself, and down to the
>   version log.
> v6:
> * Trying to fix CI not understanding dependency...
>   Don't wrap dependency line.
> v5:
> * Fix for building with MSVC:
>   Use __rte_constant() instead of __builtin_constant_p().
>   Add dependency on patch providing __rte_constant().
> v4:
> * There are no problems compiling AVX2, only AVX. (Bruce Richardson)
> v3:
> * AVX2 is a superset of AVX;
>   for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
> * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
>   check for older GCC version. (Bruce Richardson)
> v2:
> * For GCC, version 11 is required for proper AVX handling;
>   if older GCC version, treat AVX as SSE.
>   Clang does not have this issue.
>   Note: Original code always treated AVX as SSE, regardless of compiler.
> * Do not add copyright. (Stephen Hemminger)

Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>

The code change itself: LGTM.
Out of interest: do you expect any perf diff with these changes?
On my box I didn’t see any with 'memcpy_perf_autotest'.
Konstantin



^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance
  2024-06-10 13:40   ` Konstantin Ananyev
@ 2024-06-10 13:59     ` Morten Brørup
  0 siblings, 0 replies; 31+ messages in thread
From: Morten Brørup @ 2024-06-10 13:59 UTC (permalink / raw)
  To: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev,
	stephen, roretzla
  Cc: mattias.ronnblom, aconole, dev

> From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com]
> Sent: Monday, 10 June 2024 15.40
> 
> > When the rte_memcpy() size is 16, the same 16 bytes are copied twice.
> > In the case where the size is known to be 16 at build time, omit the
> > duplicate copy.
> >
> > Reduced the amount of effectively copy-pasted code by using #ifdef
> > inside functions instead of outside functions.
> >
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > ---
> > Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p")
> >
> > v8:
> > * Keep trying to fix that CI does not understand the dependency...
> >   Depend on series instead of patch. Github only understands series.
> > * Fix typo in patch description.
> > v7:
> > * Keep trying to fix that CI does not understand the dependency...
> >   Depend on patch instead of series.
> >   Move dependency out of the patch description itself, and down to the
> >   version log.
> > v6:
> > * Trying to fix CI not understanding dependency...
> >   Don't wrap dependency line.
> > v5:
> > * Fix for building with MSVC:
> >   Use __rte_constant() instead of __builtin_constant_p().
> >   Add dependency on patch providing __rte_constant().
> > v4:
> > * There are no problems compiling AVX2, only AVX. (Bruce Richardson)
> > v3:
> > * AVX2 is a superset of AVX;
> >   for a block of AVX code, testing for AVX suffices. (Bruce Richardson)
> > * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the
> >   check for older GCC version. (Bruce Richardson)
> > v2:
> > * For GCC, version 11 is required for proper AVX handling;
> >   if older GCC version, treat AVX as SSE.
> >   Clang does not have this issue.
> >   Note: Original code always treated AVX as SSE, regardless of compiler.
> > * Do not add copyright. (Stephen Hemminger)
> 
> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
> 
> The code change itself -  LGTM.
> Out of interest - do you expect any perf diff with these changes?

I don't expect a significant perf diff with these changes, not even for the affected special cases. But the generated code (for the affected cases) is smaller.

Stephen noticed that the code generated from rte_memcpy() was inefficient in some cases [1], so I decided to fix it.

[1]: https://inbox.dpdk.org/dev/20240302090207.428d4853@hermes.local/

The code generated from rte_memcpy() was not incorrect, only slightly inefficient (for the affected cases), so the patch is not a bugfix in need of backporting.

> On my box I didn’t see any with 'memcpy_perf_autotest'.
> Konstantin
> 


^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2024-06-10 13:59 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup
2024-03-03  0:38 ` Morten Brørup
2024-03-03  5:40 ` Stephen Hemminger
2024-03-03  5:47   ` Stephen Hemminger
2024-03-03  5:58     ` Stephen Hemminger
2024-03-03  5:58   ` Stephen Hemminger
2024-03-03 10:07     ` Morten Brørup
2024-03-03  5:41 ` Stephen Hemminger
2024-03-03  9:46 ` [PATCH v2] " Morten Brørup
2024-04-04  9:18   ` Morten Brørup
2024-04-04 10:07   ` Bruce Richardson
2024-04-04 11:19     ` Morten Brørup
2024-04-04 13:29       ` Bruce Richardson
2024-04-04 15:37         ` Morten Brørup
2024-04-04 15:55           ` Stephen Hemminger
2024-04-04 16:10             ` Morten Brørup
2024-04-04 16:55               ` Bruce Richardson
2024-03-03 16:05 ` [PATCH] " Stephen Hemminger
2024-04-05 12:46 ` [PATCH v3] " Morten Brørup
2024-04-05 13:17   ` Bruce Richardson
2024-04-05 13:48 ` [PATCH v4] " Morten Brørup
2024-05-27 13:15 ` Morten Brørup
2024-05-27 13:16 ` [PATCH v5] " Morten Brørup
2024-05-27 14:13   ` Morten Brørup
2024-05-28  6:18 ` Morten Brørup
2024-05-28  6:22 ` [PATCH v6] " Morten Brørup
2024-05-28  7:05 ` [PATCH v7] " Morten Brørup
2024-05-30 15:41 ` [PATCH v8] " Morten Brørup
2024-06-10  9:05   ` Morten Brørup
2024-06-10 13:40   ` Konstantin Ananyev
2024-06-10 13:59     ` Morten Brørup

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).