* [PATCH] eal/x86: improve rte_memcpy const size 16 performance @ 2024-03-02 23:48 Morten Brørup 2024-03-03 0:38 ` Morten Brørup ` (13 more replies) 0 siblings, 14 replies; 40+ messages in thread From: Morten Brørup @ 2024-03-02 23:48 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> --- lib/eal/x86/include/rte_memcpy.h | 224 ++++++++----------------------- 1 file changed, 54 insertions(+), 170 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..6cc0e8ee16 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright(c) 2010-2014 Intel Corporation + * Copyright(c) 2024 SmartShare Systems */ #ifndef _RTE_MEMCPY_X86_64_H_ @@ -91,14 +92,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. */ @@ -119,10 +112,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || defined __AVX__ __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +#define ALIGNMENT_MASK 0x3F + +/** + * AVX512 implementation below + */ + /** * Copy 128-byte blocks from one location to another, * locations should not overlap.
@@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -321,73 +340,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * AVX2 implementation below */ -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} - -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} - /** * Copy 128-byte blocks from one location to another, * locations should not overlap. 
@@ -437,15 +389,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -519,85 +470,6 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) * SSE & AVX implementation below */ -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. 
- */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} - /** * Macro for copying unaligned block from one location to another with constant load offset, * 47 bytes leftover maximum, @@ -710,20 +582,26 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (n <= 32) { + rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); +#if defined __AVX__ + rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n); +#else /* SSE implementation */ + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); +#endif return ret; } if (n <= 128) { @@ -828,8 +706,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
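The core of the change above is the __builtin_constant_p(n) guard, a GCC/Clang built-in that evaluates to 1 when its argument is a compile-time constant. The following is a minimal, self-contained sketch of the idea, not the rte_memcpy.h code itself: the helper names copy16() and copy_32_or_less() are hypothetical, and n is assumed to already be in the range [16, 32], as it is at the corresponding point in the patch.

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h> /* SSE2 intrinsics: _mm_loadu_si128()/_mm_storeu_si128() */

/* Hypothetical stand-in for rte_mov16(): copy exactly 16 bytes. */
static inline void
copy16(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}

/*
 * Fast path for 16 <= n <= 32: two overlapping 16-byte copies cover any
 * length in that range. When n is a build-time constant equal to 16,
 * __builtin_constant_p(n) folds to 1 and the second copy is compiled out,
 * so only a single 16-byte load/store pair is emitted.
 */
static inline void *
copy_32_or_less(void *dst, const void *src, size_t n)
{
	copy16((uint8_t *)dst, (const uint8_t *)src);
	if (__builtin_constant_p(n) && n == 16)
		return dst; /* skip the (harmless) duplicate copy */
	copy16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
	return dst;
}

Calling copy_32_or_less(d, s, 16) with a literal size compiles to a single 16-byte move on GCC and Clang; with a run-time n both copies are emitted, which is correct for any n in [16, 32] and merely redundant when n happens to be 16. That is the same trade-off the patch makes in rte_memcpy_generic() and rte_memcpy_aligned().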
* RE: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup @ 2024-03-03 0:38 ` Morten Brørup 2024-03-03 5:40 ` Stephen Hemminger ` (12 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-03-03 0:38 UTC (permalink / raw) To: dev Recheck-request: iol-broadcom-Performance Patch only modifies x86 code, but fails performance on aarch64. ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup 2024-03-03 0:38 ` Morten Brørup @ 2024-03-03 5:40 ` Stephen Hemminger 2024-03-03 5:47 ` Stephen Hemminger 2024-03-03 5:58 ` Stephen Hemminger 2024-03-03 5:41 ` Stephen Hemminger ` (11 subsequent siblings) 13 siblings, 2 replies; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 5:40 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev On Sun, 3 Mar 2024 00:48:12 +0100 Morten Brørup <mb@smartsharesystems.com> wrote: > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build time, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > --- Looks good, let me see how it looks in godbolt vs Gcc. One other issue is that for the non-constant case, rte_memcpy has an excessively large inline code footprint. That is one of the reasons Gcc doesn't always inline. For > 128 bytes, it really should be a function. ^ permalink raw reply [flat|nested] 40+ messages in thread
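As a hedged sketch of that last suggestion (the 128-byte threshold, the helper name large_copy() and the use of plain memcpy() as a stand-in are all assumptions, not DPDK code), keeping only the small-size branches inline and calling out of line for anything larger could look like this:

#include <stddef.h>
#include <string.h>

/*
 * Hypothetical out-of-line path for large copies. __attribute__((noinline))
 * (GCC/Clang) keeps its body from being expanded at every call site, so the
 * big copy loop exists once in the binary instead of once per caller.
 */
__attribute__((noinline)) static void *
large_copy(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n); /* stand-in for the real unrolled SIMD loop */
}

/* Only the cheap, small-size cases stay in the inline wrapper. */
static inline void *
copy_split(void *dst, const void *src, size_t n)
{
	if (n <= 128)
		return memcpy(dst, src, n); /* small: compiler can inline and unroll */
	return large_copy(dst, src, n);     /* large: one shared implementation */
}

For copies above a hundred-odd bytes the call overhead is negligible, while every call site that can reach the large path shrinks, which in turn makes the compiler more willing to honour the inline request for the wrapper itself.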
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 5:40 ` Stephen Hemminger @ 2024-03-03 5:47 ` Stephen Hemminger 2024-03-03 5:58 ` Stephen Hemminger 2024-03-03 5:58 ` Stephen Hemminger 1 sibling, 1 reply; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 5:47 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev While doing some tests with -Wall and -Wextra with current code. Saw that it doesn't really always get inlined anyway. In file included from /usr/lib/gcc/x86_64-linux-gnu/13/include/immintrin.h:37, from /usr/lib/gcc/x86_64-linux-gnu/13/include/x86intrin.h:32, from ethcopy.c:4: /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h: In function ‘rte_memcpy_generic’: /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), 
_mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in 
expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) 
| ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro 
‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:553:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 553 | case 0x01: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x01); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: 
inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | 
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 
693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: 
called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: 
MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:554:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 554 | case 0x02: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x02); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ 
‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | 
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:555:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 555 | case 0x03: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x03); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 
693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ [... snip: the same ‘error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch’ from /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185 is reported for every _mm_storeu_si128(..., _mm_alignr_epi8(...)) store in MOVEUNALIGNED_LEFT47_IMM (ethcopy.c:504-511 and 526-527), once per expansion of cases 0x03 through 0x07 of MOVEUNALIGNED_LEFT47 (ethcopy.c:555-559), all reached from MOVEUNALIGNED_LEFT47(dst, src, n, srcofs) at ethcopy.c:693 ...] ethcopy.c:559:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 559 | case 0x07: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x07); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro
‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ 
ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 
0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ 
‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | 
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:560:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 560 | case 0x08: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x08); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro 
‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ 
ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 
0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ 
‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | 
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:561:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 561 | case 0x09: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x09); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 
693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:504:9: note: 
called from here 504 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:505:9: note: called from here 505 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: 
MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ 
‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:562:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 562 | case 0x0A: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0A); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 563 | case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | 
/usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch
  185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
      | ^~~~~~~~~~~~~~~
ethcopy.c:504:9: note: called from here
  504 |         _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:563:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’
  563 |         case 0x0B: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0B); break;    \
      |                    ^~~~~~~~~~~~~~~~~~~~~~~~
ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’
  693 |         MOVEUNALIGNED_LEFT47(dst, src, n, srcofs);
      |         ^~~~~~~~~~~~~~~~~~~~

[The same ‘inlining failed in call to always_inline _mm_alignr_epi8: target specific option mismatch’ diagnostic is emitted for every _mm_storeu_si128()/_mm_alignr_epi8() statement in the macro body (ethcopy.c:504-511 and 526-527), repeated once per MOVEUNALIGNED_LEFT47_IMM switch case (0x0B, 0x0C, 0x0D, 0x0E, 0x0F, ...), all expanded from the MOVEUNALIGNED_LEFT47(dst, src, n, srcofs) call at ethcopy.c:693.]
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:506:9: note: called from here 506 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:507:9: note: called from here 507 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:508:9: note: called from here 508 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:509:9: note: called from here 509 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 
693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:510:9: note: called from here 510 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:511:9: note: called from here 511 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:526:13: 
note: called from here 526 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ /usr/lib/gcc/x86_64-linux-gnu/13/include/tmmintrin.h:185:1: error: inlining failed in call to ‘always_inline’ ‘_mm_alignr_epi8’: target specific option mismatch 185 | _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) | ^~~~~~~~~~~~~~~ ethcopy.c:527:13: note: called from here 527 | _mm_storeu_si128((__m128i *)(void *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:567:16: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47_IMM’ 567 | case 0x0F: MOVEUNALIGNED_LEFT47_IMM(dst, src, n, 0x0F); break; \ | ^~~~~~~~~~~~~~~~~~~~~~~~ ethcopy.c:693:9: note: in expansion of macro ‘MOVEUNALIGNED_LEFT47’ 693 | MOVEUNALIGNED_LEFT47(dst, src, n, srcofs); | ^~~~~~~~~~~~~~~~~~~~ Compilation exited abnormally with code 1 at Sat Mar 2 21:43:50 ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 5:47 ` Stephen Hemminger @ 2024-03-03 5:58 ` Stephen Hemminger 0 siblings, 0 replies; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 5:58 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev On Sat, 2 Mar 2024 21:47:08 -0800 Stephen Hemminger <stephen@networkplumber.org> wrote: > While doing some tests with -Wall and -Wextra with current code. > Saw that it doesn't really always get inlined anyway. NVM needed -march=native ^ permalink raw reply [flat|nested] 40+ messages in thread
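For context, a minimal standalone sketch (illustrative file and function names, not code from the thread) of why -march=native makes the errors above disappear: _mm_alignr_epi8() is an SSSE3 intrinsic declared always_inline in tmmintrin.h, so GCC refuses to inline it unless the translation unit is built with SSSE3 or better enabled (e.g. -mssse3 or -march=native) and reports the "target specific option mismatch" diagnostics shown above.

/* Minimal sketch (illustrative names, not from the thread): only compiles
 * cleanly with SSSE3 or better enabled, e.g. "gcc -O2 -mssse3 align_demo.c"
 * or "gcc -O2 -march=native align_demo.c". With plain x86-64 defaults, GCC
 * cannot inline the always_inline intrinsic wrapper and errors out. */
#include <tmmintrin.h>

static inline __attribute__((always_inline)) void
copy16_from_offset3(void *dst, const void *src)
{
	__m128i lo = _mm_loadu_si128((const __m128i *)src);
	__m128i hi = _mm_loadu_si128((const __m128i *)src + 1);

	/* Concatenate hi:lo and shift right by 3 bytes (immediate operand). */
	_mm_storeu_si128((__m128i *)dst, _mm_alignr_epi8(hi, lo, 3));
}

int main(void)
{
	unsigned char in[32] = { 1, 2, 3, 4 }, out[16];

	copy16_from_offset3(out, in);
	return out[0];	/* expect 4: the byte at offset 3 of "in" */
}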
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 5:40 ` Stephen Hemminger 2024-03-03 5:47 ` Stephen Hemminger @ 2024-03-03 5:58 ` Stephen Hemminger 2024-03-03 10:07 ` Morten Brørup 1 sibling, 1 reply; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 5:58 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev On Sat, 2 Mar 2024 21:40:03 -0800 Stephen Hemminger <stephen@networkplumber.org> wrote: > On Sun, 3 Mar 2024 00:48:12 +0100 > Morten Brørup <mb@smartsharesystems.com> wrote: > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > In the case where the size is knownto be 16 at build tine, omit the > > duplicate copy. > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > inside functions instead of outside functions. > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > --- > > Looks good, let me see how it looks in goldbolt vs Gcc. > > One other issue is that for the non-constant case, rte_memcpy has an excessively > large inline code footprint. That is one of the reasons Gcc doesn't always > inline. For > 128 bytes, it really should be a function. For size of 4,6,8,16, 32, 64, up to 128 Gcc inline and rte_memcpy match. For size 128. It looks gcc is simpler. rte_copy_addr: vmovdqu ymm0, YMMWORD PTR [rsi] vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1 vmovdqu XMMWORD PTR [rdi], xmm0 vmovdqu ymm0, YMMWORD PTR [rsi+32] vextracti128 XMMWORD PTR [rdi+48], ymm0, 0x1 vmovdqu XMMWORD PTR [rdi+32], xmm0 vmovdqu ymm0, YMMWORD PTR [rsi+64] vextracti128 XMMWORD PTR [rdi+80], ymm0, 0x1 vmovdqu XMMWORD PTR [rdi+64], xmm0 vmovdqu ymm0, YMMWORD PTR [rsi+96] vextracti128 XMMWORD PTR [rdi+112], ymm0, 0x1 vmovdqu XMMWORD PTR [rdi+96], xmm0 vzeroupper ret copy_addr: vmovdqu ymm0, YMMWORD PTR [rsi] vmovdqu YMMWORD PTR [rdi], ymm0 vmovdqu ymm1, YMMWORD PTR [rsi+32] vmovdqu YMMWORD PTR [rdi+32], ymm1 vmovdqu ymm2, YMMWORD PTR [rsi+64] vmovdqu YMMWORD PTR [rdi+64], ymm2 vmovdqu ymm3, YMMWORD PTR [rsi+96] vmovdqu YMMWORD PTR [rdi+96], ymm3 vzeroupper ret ^ permalink raw reply [flat|nested] 40+ messages in thread
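The two assembly listings above presumably come from a harness along these lines (a reconstruction, not the exact test; assumed to be compiled with optimization and AVX2 enabled, e.g. -O3 -march=haswell, on godbolt):

/* Reconstruction (assumed, not the exact code used) of the two functions
 * whose code generation is compared above: one using the compiler's
 * builtin memcpy, one using DPDK's rte_memcpy, both with a constant
 * size of 128 bytes. */
#include <string.h>
#include <rte_memcpy.h>

void
copy_addr(void *dst, const void *src)
{
	memcpy(dst, src, 128);		/* compiler builtin memcpy */
}

void
rte_copy_addr(void *dst, const void *src)
{
	rte_memcpy(dst, src, 128);	/* DPDK inline copy */
}

The builtin version comes out as four plain 32-byte vmovdqu load/store pairs, while the rte_memcpy version shows the vextracti128 splitting that the follow-up below attributes to older GCC versions.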
* RE: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 5:58 ` Stephen Hemminger @ 2024-03-03 10:07 ` Morten Brørup 0 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-03-03 10:07 UTC (permalink / raw) To: Stephen Hemminger Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev > From: Stephen Hemminger [mailto:stephen@networkplumber.org] > Sent: Sunday, 3 March 2024 06.58 > > On Sat, 2 Mar 2024 21:40:03 -0800 > Stephen Hemminger <stephen@networkplumber.org> wrote: > > > On Sun, 3 Mar 2024 00:48:12 +0100 > > Morten Brørup <mb@smartsharesystems.com> wrote: > > > > > When the rte_memcpy() size is 16, the same 16 bytes are copied > twice. > > > In the case where the size is knownto be 16 at build tine, omit the > > > duplicate copy. > > > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > > inside functions instead of outside functions. > > > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > > --- > > > > Looks good, let me see how it looks in goldbolt vs Gcc. > > > > One other issue is that for the non-constant case, rte_memcpy has an > excessively > > large inline code footprint. That is one of the reasons Gcc doesn't > always > > inline. For > 128 bytes, it really should be a function. Yes, the code footprint is significant for the non-constant case. I suppose Intel considered the cost and benefits when they developed this. Or perhaps they just wanted a showcase for their new and shiny vector instructions. ;-) Inlining might provide significant branch prediction benefits in cases where the size is not build-time constant, but run-time constant. > > For size of 4,6,8,16, 32, 64, up to 128 Gcc inline and rte_memcpy match. > > For size 128. It looks gcc is simpler. > > rte_copy_addr: > vmovdqu ymm0, YMMWORD PTR [rsi] > vextracti128 XMMWORD PTR [rdi+16], ymm0, 0x1 > vmovdqu XMMWORD PTR [rdi], xmm0 > vmovdqu ymm0, YMMWORD PTR [rsi+32] > vextracti128 XMMWORD PTR [rdi+48], ymm0, 0x1 > vmovdqu XMMWORD PTR [rdi+32], xmm0 > vmovdqu ymm0, YMMWORD PTR [rsi+64] > vextracti128 XMMWORD PTR [rdi+80], ymm0, 0x1 > vmovdqu XMMWORD PTR [rdi+64], xmm0 > vmovdqu ymm0, YMMWORD PTR [rsi+96] > vextracti128 XMMWORD PTR [rdi+112], ymm0, 0x1 > vmovdqu XMMWORD PTR [rdi+96], xmm0 > vzeroupper > ret Interesting. Playing around with Godbolt revealed that GCC version < 11 creates the above from rte_memcpy, whereas GCC version >= 11 does it correctly. Clang doesn't have this issue. I guess that's why the original code treated AVX as SSE. Fixed in v2. > copy_addr: > vmovdqu ymm0, YMMWORD PTR [rsi] > vmovdqu YMMWORD PTR [rdi], ymm0 > vmovdqu ymm1, YMMWORD PTR [rsi+32] > vmovdqu YMMWORD PTR [rdi+32], ymm1 > vmovdqu ymm2, YMMWORD PTR [rsi+64] > vmovdqu YMMWORD PTR [rdi+64], ymm2 > vmovdqu ymm3, YMMWORD PTR [rsi+96] > vmovdqu YMMWORD PTR [rdi+96], ymm3 > vzeroupper > ret ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup 2024-03-03 0:38 ` Morten Brørup 2024-03-03 5:40 ` Stephen Hemminger @ 2024-03-03 5:41 ` Stephen Hemminger 2024-03-03 9:46 ` [PATCH v2] " Morten Brørup ` (10 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 5:41 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev On Sun, 3 Mar 2024 00:48:12 +0100 Morten Brørup <mb@smartsharesystems.com> wrote: > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h > index 72a92290e0..6cc0e8ee16 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -1,5 +1,6 @@ > /* SPDX-License-Identifier: BSD-3-Clause > * Copyright(c) 2010-2014 Intel Corporation > + * Copyright(c) 2024 SmartShare Systems > */ Let's not start the precedent of adding individual copyrights on patches. ^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (2 preceding siblings ...) 2024-03-03 5:41 ` Stephen Hemminger @ 2024-03-03 9:46 ` Morten Brørup 2024-04-04 9:18 ` Morten Brørup 2024-04-04 10:07 ` Bruce Richardson 2024-03-03 16:05 ` [PATCH] " Stephen Hemminger ` (9 subsequent siblings) 13 siblings, 2 replies; 40+ messages in thread From: Morten Brørup @ 2024-03-03 9:46 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build tine, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> --- v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- 1 file changed, 56 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..d1df841f5e 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \ + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))) __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +332,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F +#elif defined __AVX2__ || \ + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))) /** - * AVX2 implementation below + * AVX2 (and AVX, unless too old GCC version) implementation below */ -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} - -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +390,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +465,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * SSE (and AVX, with too old GCC version) implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +585,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +699,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +714,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 
64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
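To illustrate the pattern the v2 patch above relies on, here is a standalone sketch (assumed names, not the DPDK code itself) of how a __builtin_constant_p() guard removes the duplicate 16-byte copy only when the size is a build-time constant 16, at no cost for run-time sizes:

/* Standalone sketch (assumed names): the __builtin_constant_p() test folds
 * at compile time, so it costs nothing for run-time sizes and skips the
 * duplicate 16-byte copy when the caller passes a literal 16.
 * Callers are assumed to guarantee 16 <= n <= 32, as in rte_memcpy(). */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static inline void
mov16(uint8_t *dst, const uint8_t *src)
{
	memcpy(dst, src, 16);	/* stand-in for the SSE/AVX rte_mov16() */
}

static inline void *
copy_16_to_32(void *dst, const void *src, size_t n)
{
	mov16((uint8_t *)dst, (const uint8_t *)src);
	if (__builtin_constant_p(n) && n == 16)
		return dst;	/* constant 16: the tail copy below would be a duplicate */
	/* Overlapping tail copy covers the remaining n - 16 bytes. */
	mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
	return dst;
}

With copy_16_to_32(d, s, 16) and a literal 16 the guard folds to true and only one mov16() is emitted; with a run-time n it folds to false and the generated code is identical to the original two overlapping moves.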
* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 9:46 ` [PATCH v2] " Morten Brørup @ 2024-04-04 9:18 ` Morten Brørup 2024-04-04 10:07 ` Bruce Richardson 1 sibling, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-04-04 9:18 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev; +Cc: mattias.ronnblom, dev, stephen PING Intel x86 maintainers for review. > From: Morten Brørup [mailto:mb@smartsharesystems.com] > Sent: Sunday, 3 March 2024 10.46 > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build tine, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > --- > v2: > * For GCC, version 11 is required for proper AVX handling; > if older GCC version, treat AVX as SSE. > Clang does not have this issue. > Note: Original code always treated AVX as SSE, regardless of compiler. > * Do not add copyright. (Stephen Hemminger) > --- > lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- > 1 file changed, 56 insertions(+), 175 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h > b/lib/eal/x86/include/rte_memcpy.h > index 72a92290e0..d1df841f5e 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) > return ret; > } > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > - > -#define ALIGNMENT_MASK 0x3F > - > -/** > - * AVX512 implementation below > - */ > - > /** > * Copy 16 bytes from one location to another, > * locations should not overlap. 
> @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov32(uint8_t *dst, const uint8_t *src) > { > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || > \ > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > < 110000))) > __m256i ymm0; > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > _mm256_storeu_si256((__m256i *)dst, ymm0); > +#else /* SSE implementation */ > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > +#endif > } > > /** > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov64(uint8_t *dst, const uint8_t *src) > { > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > __m512i zmm0; > > zmm0 = _mm512_loadu_si512((const void *)src); > _mm512_storeu_si512((void *)dst, zmm0); > +#else /* AVX2, AVX & SSE implementation */ > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > +#endif > } > > /** > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov256(uint8_t *dst, const uint8_t *src) > { > - rte_mov64(dst + 0 * 64, src + 0 * 64); > - rte_mov64(dst + 1 * 64, src + 1 * 64); > - rte_mov64(dst + 2 * 64, src + 2 * 64); > - rte_mov64(dst + 3 * 64, src + 3 * 64); > + rte_mov128(dst + 0 * 128, src + 0 * 128); > + rte_mov128(dst + 1 * 128, src + 1 * 128); > } > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > + > +/** > + * AVX512 implementation below > + */ > + > +#define ALIGNMENT_MASK 0x3F > + > /** > * Copy 128-byte blocks from one location to another, > * locations should not overlap. > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > /** > * Fast way when copy size doesn't exceed 512 bytes > */ > + if (__builtin_constant_p(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > + if (__builtin_constant_p(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > return ret; > } > + if (__builtin_constant_p(n) && n == 64) { > + rte_mov64((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 64) { > rte_mov32((uint8_t *)dst, (const uint8_t *)src); > rte_mov32((uint8_t *)dst - 32 + n, > @@ -313,80 +332,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > goto COPY_BLOCK_128_BACK63; > } > > -#elif defined __AVX2__ > - > -#define ALIGNMENT_MASK 0x1F > +#elif defined __AVX2__ || \ > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > < 110000))) > > /** > - * AVX2 implementation below > + * AVX2 (and AVX, unless too old GCC version) implementation below > */ > > -/** > - * Copy 16 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov16(uint8_t *dst, const uint8_t *src) > -{ > - __m128i xmm0; > - > - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); > - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); > -} > - > -/** > - * Copy 32 bytes from one location to another, > - * locations should not overlap. 
> - */ > -static __rte_always_inline void > -rte_mov32(uint8_t *dst, const uint8_t *src) > -{ > - __m256i ymm0; > - > - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); > - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); > -} > - > -/** > - * Copy 64 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov64(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > -} > - > -/** > - * Copy 128 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov128(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); > - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); > -} > - > -/** > - * Copy 256 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov256(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); > - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); > - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); > - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); > - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); > - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); > -} > +#define ALIGNMENT_MASK 0x1F > > /** > * Copy 128-byte blocks from one location to another, > @@ -437,15 +390,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > /** > * Fast way when copy size doesn't exceed 256 bytes > */ > - if (n <= 32) { > - rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst - 16 + n, > - (const uint8_t *)src - 16 + n); > + if (__builtin_constant_p(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > return ret; > } > - if (n <= 48) { > + if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); > + if (__builtin_constant_p(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > return ret; > @@ -513,90 +465,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > > #else /* __AVX512F__ */ > > -#define ALIGNMENT_MASK 0x0F > - > -/** > - * SSE & AVX implementation below > - */ > - > -/** > - * Copy 16 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov16(uint8_t *dst, const uint8_t *src) > -{ > - __m128i xmm0; > - > - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); > - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); > -} > - > -/** > - * Copy 32 bytes from one location to another, > - * locations should not overlap. 
> - */ > -static __rte_always_inline void > -rte_mov32(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > -} > - > /** > - * Copy 64 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov64(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > -} > - > -/** > - * Copy 128 bytes from one location to another, > - * locations should not overlap. > + * SSE (and AVX, with too old GCC version) implementation below > */ > -static __rte_always_inline void > -rte_mov128(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); > - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); > - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); > - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); > -} > > -/** > - * Copy 256 bytes from one location to another, > - * locations should not overlap. > - */ > -static inline void > -rte_mov256(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); > - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); > - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); > - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); > - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); > - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); > - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); > - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); > - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); > - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); > - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); > - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); > -} > +#define ALIGNMENT_MASK 0x0F > > /** > * Macro for copying unaligned block from one location to another with > constant load offset, > @@ -712,17 +585,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > */ > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > - return ret; > - } > - if (n <= 48) { > - rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + if (__builtin_constant_p(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > return ret; > } > if (n <= 64) { > 
rte_mov32((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); > + if (n > 48) > + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); > rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > return ret; > } > @@ -828,8 +699,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) > } > > /* Copy 16 <= size <= 32 bytes */ > + if (__builtin_constant_p(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > + if (__builtin_constant_p(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > > @@ -837,6 +714,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) > } > > /* Copy 32 < size <= 64 bytes */ > + if (__builtin_constant_p(n) && n == 64) { > + rte_mov64((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 64) { > rte_mov32((uint8_t *)dst, (const uint8_t *)src); > rte_mov32((uint8_t *)dst - 32 + n, > -- > 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-03-03 9:46 ` [PATCH v2] " Morten Brørup 2024-04-04 9:18 ` Morten Brørup @ 2024-04-04 10:07 ` Bruce Richardson 2024-04-04 11:19 ` Morten Brørup 1 sibling, 1 reply; 40+ messages in thread From: Bruce Richardson @ 2024-04-04 10:07 UTC (permalink / raw) To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote: > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build tine, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Changes in general look good to me. Comments inline below. /Bruce > --- > v2: > * For GCC, version 11 is required for proper AVX handling; > if older GCC version, treat AVX as SSE. > Clang does not have this issue. > Note: Original code always treated AVX as SSE, regardless of compiler. > * Do not add copyright. (Stephen Hemminger) > --- > lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- > 1 file changed, 56 insertions(+), 175 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h > index 72a92290e0..d1df841f5e 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) > return ret; > } > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > - > -#define ALIGNMENT_MASK 0x3F > - > -/** > - * AVX512 implementation below > - */ > - > /** > * Copy 16 bytes from one location to another, > * locations should not overlap. > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov32(uint8_t *dst, const uint8_t *src) > { > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ || \ > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))) I think we can drop the AVX512 checks here, since I'm not aware of any system where we'd have AVX512 but not AVX2 available, so just checking for AVX2 support should be sufficient. On the final compiler-based check, I don't strongly object to it, but I just wonder as to its real value. AVX2 was first introduced by Intel over 10 years ago, and (from what I find in wikipedia), it's been in AMD CPUs since ~2015. While we did have CPUs still being produced without AVX2 since that time, they generally didn't have AVX1 either, only having SSE instructions. Therefore the number of systems which require this additional check is likely very small at this stage. That said, I'm ok to either keep or omit it at your choice. If you do keep it, how about putting the check once at the top of the file and using a single short define instead for the multiple places it's used e.g. 
#if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000))) #define RTE_MEMCPY_AVX2 #endif > __m256i ymm0; > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > _mm256_storeu_si256((__m256i *)dst, ymm0); > +#else /* SSE implementation */ > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > +#endif > } > > /** > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov64(uint8_t *dst, const uint8_t *src) > { > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > __m512i zmm0; > > zmm0 = _mm512_loadu_si512((const void *)src); > _mm512_storeu_si512((void *)dst, zmm0); > +#else /* AVX2, AVX & SSE implementation */ > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > +#endif > } > > /** > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov256(uint8_t *dst, const uint8_t *src) > { > - rte_mov64(dst + 0 * 64, src + 0 * 64); > - rte_mov64(dst + 1 * 64, src + 1 * 64); > - rte_mov64(dst + 2 * 64, src + 2 * 64); > - rte_mov64(dst + 3 * 64, src + 3 * 64); > + rte_mov128(dst + 0 * 128, src + 0 * 128); > + rte_mov128(dst + 1 * 128, src + 1 * 128); > } > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > + > +/** > + * AVX512 implementation below > + */ > + > +#define ALIGNMENT_MASK 0x3F > + > /** > * Copy 128-byte blocks from one location to another, > * locations should not overlap. > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > /** > * Fast way when copy size doesn't exceed 512 bytes > */ > + if (__builtin_constant_p(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } There's an outstanding patchset from Stephen to replace all use of rte_memcpy with a constant parameter with an actual call to regular memcpy. On a wider scale should we not look to do something similar in this file, have calls to rte_memcpy with constant parameter always turn into a call to regular memcpy? We used to have such a macro in older DPDK e.g. from DPDK 1.8 http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?h=v1.8.0#n171 This would elminiate the need to put in constant_p checks all through the code. > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > + if (__builtin_constant_p(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > return ret; > } <snip> ^ permalink raw reply [flat|nested] 40+ messages in thread
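For reference, the DPDK 1.8 macro Bruce links to had approximately this shape (paraphrased from memory; the helper name rte_memcpy_func and the exact formatting may differ from the 1.8 tree): constant sizes were dispatched to the compiler's memcpy(), everything else to the inline implementation, so no __builtin_constant_p() checks were needed inside the copy routines themselves.

/* Approximate shape of the DPDK 1.8 macro referenced above (paraphrase,
 * not a verbatim copy from the 1.8 tree). */
#define rte_memcpy(dst, src, n)               \
	((__builtin_constant_p(n)) ?          \
	 memcpy((dst), (src), (n)) :          \
	 rte_memcpy_func((dst), (src), (n)))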
* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 10:07 ` Bruce Richardson @ 2024-04-04 11:19 ` Morten Brørup 2024-04-04 13:29 ` Bruce Richardson 0 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-04-04 11:19 UTC (permalink / raw) To: Bruce Richardson; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > Sent: Thursday, 4 April 2024 12.07 > > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote: > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > In the case where the size is known to be 16 at build tine, omit the > > duplicate copy. > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > inside functions instead of outside functions. > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > Changes in general look good to me. Comments inline below. > > /Bruce > > > --- > > v2: > > * For GCC, version 11 is required for proper AVX handling; > > if older GCC version, treat AVX as SSE. > > Clang does not have this issue. > > Note: Original code always treated AVX as SSE, regardless of compiler. > > * Do not add copyright. (Stephen Hemminger) > > --- > > lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- > > 1 file changed, 56 insertions(+), 175 deletions(-) > > > > diff --git a/lib/eal/x86/include/rte_memcpy.h > b/lib/eal/x86/include/rte_memcpy.h > > index 72a92290e0..d1df841f5e 100644 > > --- a/lib/eal/x86/include/rte_memcpy.h > > +++ b/lib/eal/x86/include/rte_memcpy.h > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) > > return ret; > > } > > > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > - > > -#define ALIGNMENT_MASK 0x3F > > - > > -/** > > - * AVX512 implementation below > > - */ > > - > > /** > > * Copy 16 bytes from one location to another, > > * locations should not overlap. > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > > static __rte_always_inline void > > rte_mov32(uint8_t *dst, const uint8_t *src) > > { > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ > || \ > > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > < 110000))) > > I think we can drop the AVX512 checks here, since I'm not aware of any > system where we'd have AVX512 but not AVX2 available, so just checking for > AVX2 support should be sufficient. RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512: https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98 Without it, the AVX2 version will be used, regardless if the CPU has AVX512. Also, there are some binutils bugs that might disable compilation for AVX512: https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4 https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17 > > On the final compiler-based check, I don't strongly object to it, but I > just wonder as to its real value. AVX2 was first introduced by Intel over 10 > years ago, and (from what I find in wikipedia), it's been in AMD CPUs since > ~2015. While we did have CPUs still being produced without AVX2 since that > time, they generally didn't have AVX1 either, only having SSE instructions. > Therefore the number of systems which require this additional check is > likely very small at this stage. 
> That said, I'm ok to either keep or omit it at your choice. I kept it for consistency, and to support older compilers still officially supported by DPDK. I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors. Also, I have no clue what has been produced by Intel and AMD. :-) > If you do keep > it, how about putting the check once at the top of the file and using a > single short define instead for the multiple places it's used e.g. > > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < > 110000))) > #define RTE_MEMCPY_AVX2 > #endif Much of the code reorganization in this patch was done with the intention to improve readability. And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined. However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short. > > > > __m256i ymm0; > > > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > > _mm256_storeu_si256((__m256i *)dst, ymm0); > > +#else /* SSE implementation */ > > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > > +#endif > > } > > > > /** > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > > static __rte_always_inline void > > rte_mov64(uint8_t *dst, const uint8_t *src) > > { > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > __m512i zmm0; > > > > zmm0 = _mm512_loadu_si512((const void *)src); > > _mm512_storeu_si512((void *)dst, zmm0); > > +#else /* AVX2, AVX & SSE implementation */ > > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > > +#endif > > } > > > > /** > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > > static __rte_always_inline void > > rte_mov256(uint8_t *dst, const uint8_t *src) > > { > > - rte_mov64(dst + 0 * 64, src + 0 * 64); > > - rte_mov64(dst + 1 * 64, src + 1 * 64); > > - rte_mov64(dst + 2 * 64, src + 2 * 64); > > - rte_mov64(dst + 3 * 64, src + 3 * 64); > > + rte_mov128(dst + 0 * 128, src + 0 * 128); > > + rte_mov128(dst + 1 * 128, src + 1 * 128); > > } > > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > + > > +/** > > + * AVX512 implementation below > > + */ > > + > > +#define ALIGNMENT_MASK 0x3F > > + > > /** > > * Copy 128-byte blocks from one location to another, > > * locations should not overlap. > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t > n) > > /** > > * Fast way when copy size doesn't exceed 512 bytes > > */ > > + if (__builtin_constant_p(n) && n == 32) { > > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > > + return ret; > > + } > > There's an outstanding patchset from Stephen to replace all use of > rte_memcpy with a constant parameter with an actual call to regular memcpy. > On a wider scale should we not look to do something similar in this file, > have calls to rte_memcpy with constant parameter always turn into a call to > regular memcpy? We used to have such a macro in older DPDK e.g. > from DPDK 1.8 > > http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp > y.h?h=v1.8.0#n171 > > This would elminiate the need to put in constant_p checks all through the > code. 
The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies": https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475 Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size. I'm not convinced. Such a change should be backed up by performance tests, preferably for all supported compilers - especially the old compilers that come with some of the supported distros might not be as good as we would hope. ^ permalink raw reply [flat|nested] 40+ messages in thread
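Since the thread keeps coming back to "show the numbers", here is a rough sketch of the kind of micro-measurement that could back such a change up, along the lines of DPDK's existing memcpy performance test in app/test; the iteration count, the compiler barrier and the buffer handling are illustrative assumptions, not a tuned benchmark:

#include <stdint.h>
#include <string.h>
#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_memcpy.h>

#define ITERATIONS (1 << 20)

static uint64_t
cycles_rte_memcpy_const16(uint8_t *dst, const uint8_t *src)
{
        uint64_t start = rte_rdtsc();
        for (unsigned int i = 0; i < ITERATIONS; i++) {
                rte_memcpy(dst, src, 16);       /* constant size 16 */
                rte_compiler_barrier();         /* keep the copy inside the loop */
        }
        return rte_rdtsc() - start;
}

static uint64_t
cycles_builtin_memcpy_const16(uint8_t *dst, const uint8_t *src)
{
        uint64_t start = rte_rdtsc();
        for (unsigned int i = 0; i < ITERATIONS; i++) {
                memcpy(dst, src, 16);           /* compiler's built-in memcpy */
                rte_compiler_barrier();
        }
        return rte_rdtsc() - start;
}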
* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 11:19 ` Morten Brørup @ 2024-04-04 13:29 ` Bruce Richardson 2024-04-04 15:37 ` Morten Brørup 0 siblings, 1 reply; 40+ messages in thread From: Bruce Richardson @ 2024-04-04 13:29 UTC (permalink / raw) To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote: > > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > > Sent: Thursday, 4 April 2024 12.07 > > > > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote: > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > > In the case where the size is known to be 16 at build tine, omit the > > > duplicate copy. > > > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > > inside functions instead of outside functions. > > > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > > > Changes in general look good to me. Comments inline below. > > > > /Bruce > > > > > --- > > > v2: > > > * For GCC, version 11 is required for proper AVX handling; > > > if older GCC version, treat AVX as SSE. > > > Clang does not have this issue. > > > Note: Original code always treated AVX as SSE, regardless of compiler. > > > * Do not add copyright. (Stephen Hemminger) > > > --- > > > lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- > > > 1 file changed, 56 insertions(+), 175 deletions(-) > > > > > > diff --git a/lib/eal/x86/include/rte_memcpy.h > > b/lib/eal/x86/include/rte_memcpy.h > > > index 72a92290e0..d1df841f5e 100644 > > > --- a/lib/eal/x86/include/rte_memcpy.h > > > +++ b/lib/eal/x86/include/rte_memcpy.h > > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) > > > return ret; > > > } > > > > > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > - > > > -#define ALIGNMENT_MASK 0x3F > > > - > > > -/** > > > - * AVX512 implementation below > > > - */ > > > - > > > /** > > > * Copy 16 bytes from one location to another, > > > * locations should not overlap. > > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > > > static __rte_always_inline void > > > rte_mov32(uint8_t *dst, const uint8_t *src) > > > { > > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined __AVX2__ > > || \ > > > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > > < 110000))) > > > > I think we can drop the AVX512 checks here, since I'm not aware of any > > system where we'd have AVX512 but not AVX2 available, so just checking for > > AVX2 support should be sufficient. > > RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512: > https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memcpy.h#L98 > > Without it, the AVX2 version will be used, regardless if the CPU has AVX512. > > Also, there are some binutils bugs that might disable compilation for AVX512: > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4 > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17 > Yes, I realise that, but the guard here is for an AVX2 block only, so there is no point in checking for AVX512 - it's AVX512 or AVX2. > > > > On the final compiler-based check, I don't strongly object to it, but I > > just wonder as to its real value. 
AVX2 was first introduced by Intel over 10 > > years ago, and (from what I find in wikipedia), it's been in AMD CPUs since > > ~2015. While we did have CPUs still being produced without AVX2 since that > > time, they generally didn't have AVX1 either, only having SSE instructions. > > Therefore the number of systems which require this additional check is > > likely very small at this stage. > > That said, I'm ok to either keep or omit it at your choice. > > I kept it for consistency, and to support older compilers still officially supported by DPDK. > > I don't feel qualified to change support for CPU features; I'll leave that to the CPU vendors. > Also, I have no clue what has been produced by Intel and AMD. :-) > > > If you do keep > > it, how about putting the check once at the top of the file and using a > > single short define instead for the multiple places it's used e.g. > > > > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < > > 110000))) > > #define RTE_MEMCPY_AVX2 > > #endif > > Much of the code reorganization in this patch was done with the intention to improve readability. > > And I don't think this suggestion improves readability; especially considering that RTE_MEMCPY_AVX512 is something manually defined. > > However, I get your point; and if the conditional was very long or very complex, I might agree to a "shadow" definition to keep it short. > I just find it long enough that duplication of it seems painful. :-) I'd rather we check once at the top if we can use an AVX copy vs SSE, rather than duplicate the compiler version checks multiple times. > > > > > > > __m256i ymm0; > > > > > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > > > _mm256_storeu_si256((__m256i *)dst, ymm0); > > > +#else /* SSE implementation */ > > > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > > > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > > > +#endif > > > } > > > > > > /** > > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > > > static __rte_always_inline void > > > rte_mov64(uint8_t *dst, const uint8_t *src) > > > { > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > __m512i zmm0; > > > > > > zmm0 = _mm512_loadu_si512((const void *)src); > > > _mm512_storeu_si512((void *)dst, zmm0); > > > +#else /* AVX2, AVX & SSE implementation */ > > > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > > > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > > > +#endif > > > } > > > > > > /** > > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > > > static __rte_always_inline void > > > rte_mov256(uint8_t *dst, const uint8_t *src) > > > { > > > - rte_mov64(dst + 0 * 64, src + 0 * 64); > > > - rte_mov64(dst + 1 * 64, src + 1 * 64); > > > - rte_mov64(dst + 2 * 64, src + 2 * 64); > > > - rte_mov64(dst + 3 * 64, src + 3 * 64); > > > + rte_mov128(dst + 0 * 128, src + 0 * 128); > > > + rte_mov128(dst + 1 * 128, src + 1 * 128); > > > } > > > > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > + > > > +/** > > > + * AVX512 implementation below > > > + */ > > > + > > > +#define ALIGNMENT_MASK 0x3F > > > + > > > /** > > > * Copy 128-byte blocks from one location to another, > > > * locations should not overlap. 
> > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t > > n) > > > /** > > > * Fast way when copy size doesn't exceed 512 bytes > > > */ > > > + if (__builtin_constant_p(n) && n == 32) { > > > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > > > + return ret; > > > + } > > > > There's an outstanding patchset from Stephen to replace all use of > > rte_memcpy with a constant parameter with an actual call to regular memcpy. > > On a wider scale should we not look to do something similar in this file, > > have calls to rte_memcpy with constant parameter always turn into a call to > > regular memcpy? We used to have such a macro in older DPDK e.g. > > from DPDK 1.8 > > > > http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp > > y.h?h=v1.8.0#n171 > > > > This would elminiate the need to put in constant_p checks all through the > > code. > > The old macro in DPDK 1.8 was removed with the description "Remove slow glibc call for constant copies": > https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_memcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475 > > Stephen believes that the memcpy() built-ins provided by compilers are faster than rte_memcpy() for constant size. > I'm not convinced. > Such a change should be backed up by performance tests, preferably for all supported compilers - especially the old compilers that come with some of the supported distros might not be as good as we would hope. > I would tend to agree with Stephen that whereever possible we should use the built-in memcpy calls. Hence my suggestion of re-introducing the macro. I'm not sure why it previously was seen as slower, it may be that the compiler-expanded memcpy calls are not done beyond a certain size. However, since we lack data, I'm ok with taking the changes in your patch as-is. With the above-flagged superfluous AVX512 check on AVX2 code removed: Acked-by: Bruce Richardson <bruce.richardson@intel.com> ^ permalink raw reply [flat|nested] 40+ messages in thread
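The ISA-subset assumption behind the "it's AVX512 or AVX2" point above - any target that defines __AVX512F__ also has __AVX2__ and __AVX__ enabled, so an AVX2-only block never needs an extra AVX512 test - can be spot-checked at compile time; a small sketch assuming GCC or Clang predefined macros:

#if defined(__AVX512F__) && !defined(__AVX2__)
#error "unexpected target: __AVX512F__ without __AVX2__"
#endif
#if defined(__AVX2__) && !defined(__AVX__)
#error "unexpected target: __AVX2__ without __AVX__"
#endif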
* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 13:29 ` Bruce Richardson @ 2024-04-04 15:37 ` Morten Brørup 2024-04-04 15:55 ` Stephen Hemminger 0 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-04-04 15:37 UTC (permalink / raw) To: Bruce Richardson; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > Sent: Thursday, 4 April 2024 15.29 > > On Thu, Apr 04, 2024 at 01:19:54PM +0200, Morten Brørup wrote: > > > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > > > Sent: Thursday, 4 April 2024 12.07 > > > > > > On Sun, Mar 03, 2024 at 10:46:21AM +0100, Morten Brørup wrote: > > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > > > In the case where the size is known to be 16 at build tine, omit the > > > > duplicate copy. > > > > > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > > > inside functions instead of outside functions. > > > > > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > > > > > Changes in general look good to me. Comments inline below. > > > > > > /Bruce > > > > > > > --- > > > > v2: > > > > * For GCC, version 11 is required for proper AVX handling; > > > > if older GCC version, treat AVX as SSE. > > > > Clang does not have this issue. > > > > Note: Original code always treated AVX as SSE, regardless of compiler. > > > > * Do not add copyright. (Stephen Hemminger) > > > > --- > > > > lib/eal/x86/include/rte_memcpy.h | 231 ++++++++----------------------- > > > > 1 file changed, 56 insertions(+), 175 deletions(-) > > > > > > > > diff --git a/lib/eal/x86/include/rte_memcpy.h > > > b/lib/eal/x86/include/rte_memcpy.h > > > > index 72a92290e0..d1df841f5e 100644 > > > > --- a/lib/eal/x86/include/rte_memcpy.h > > > > +++ b/lib/eal/x86/include/rte_memcpy.h > > > > @@ -91,14 +91,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t > n) > > > > return ret; > > > > } > > > > > > > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > > - > > > > -#define ALIGNMENT_MASK 0x3F > > > > - > > > > -/** > > > > - * AVX512 implementation below > > > > - */ > > > > - > > > > /** > > > > * Copy 16 bytes from one location to another, > > > > * locations should not overlap. > > > > @@ -119,10 +111,16 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > > > > static __rte_always_inline void > > > > rte_mov32(uint8_t *dst, const uint8_t *src) > > > > { > > > > +#if (defined __AVX512F__ && defined RTE_MEMCPY_AVX512) || defined > __AVX2__ > > > || \ > > > > + (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && > (GCC_VERSION > > > < 110000))) > > > > > > I think we can drop the AVX512 checks here, since I'm not aware of any > > > system where we'd have AVX512 but not AVX2 available, so just checking for > > > AVX2 support should be sufficient. > > > > RTE_MEMCPY_AVX512 must be manually defined at build time to enable AVX512: > > > https://elixir.bootlin.com/dpdk/latest/source/lib/eal/include/generic/rte_memc > py.h#L98 > > > > Without it, the AVX2 version will be used, regardless if the CPU has AVX512. 
> > > > Also, there are some binutils bugs that might disable compilation for > AVX512: > > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L4 > > https://elixir.bootlin.com/dpdk/latest/source/config/x86/meson.build#L17 > > > > Yes, I realise that, but the guard here is for an AVX2 block only, so there > is no point in checking for AVX512 - it's AVX512 or AVX2. Aha! Now I get your point: Checking for AVX2 suffices for AVX2 code. I didn't think of that when combining the copy-pasted code into one code block. Well spotted! Thank you. > > > > > > > On the final compiler-based check, I don't strongly object to it, but I > > > just wonder as to its real value. AVX2 was first introduced by Intel over > 10 > > > years ago, and (from what I find in wikipedia), it's been in AMD CPUs > since > > > ~2015. While we did have CPUs still being produced without AVX2 since that > > > time, they generally didn't have AVX1 either, only having SSE > instructions. > > > Therefore the number of systems which require this additional check is > > > likely very small at this stage. > > > That said, I'm ok to either keep or omit it at your choice. > > > > I kept it for consistency, and to support older compilers still officially > supported by DPDK. > > > > I don't feel qualified to change support for CPU features; I'll leave that > to the CPU vendors. > > Also, I have no clue what has been produced by Intel and AMD. :-) > > > > > If you do keep > > > it, how about putting the check once at the top of the file and using a > > > single short define instead for the multiple places it's used e.g. > > > > > > #if (defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < > > > 110000))) > > > #define RTE_MEMCPY_AVX2 > > > #endif > > > > Much of the code reorganization in this patch was done with the intention to > improve readability. > > > > And I don't think this suggestion improves readability; especially > considering that RTE_MEMCPY_AVX512 is something manually defined. > > > > However, I get your point; and if the conditional was very long or very > complex, I might agree to a "shadow" definition to keep it short. > > > > I just find it long enough that duplication of it seems painful. :-) I'd > rather we check once at the top if we can use an AVX copy vs SSE, rather > than duplicate the compiler version checks multiple times. OK. And I suppose the same principle as above applies: AVX2 implies AVX, so checking for AVX suffices. I suppose your suggested name RTE_MEMCPY_AVX2 was a typo, and will define it as RTE_MEMCPY_AVX. 
> > > > > > > > > > > > __m256i ymm0; > > > > > > > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > > > > _mm256_storeu_si256((__m256i *)dst, ymm0); > > > > +#else /* SSE implementation */ > > > > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * > 16); > > > > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * > 16); > > > > +#endif > > > > } > > > > > > > > /** > > > > @@ -132,10 +130,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > > > > static __rte_always_inline void > > > > rte_mov64(uint8_t *dst, const uint8_t *src) > > > > { > > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > > __m512i zmm0; > > > > > > > > zmm0 = _mm512_loadu_si512((const void *)src); > > > > _mm512_storeu_si512((void *)dst, zmm0); > > > > +#else /* AVX2, AVX & SSE implementation */ > > > > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * > 32); > > > > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * > 32); > > > > +#endif > > > > } > > > > > > > > /** > > > > @@ -156,12 +159,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > > > > static __rte_always_inline void > > > > rte_mov256(uint8_t *dst, const uint8_t *src) > > > > { > > > > - rte_mov64(dst + 0 * 64, src + 0 * 64); > > > > - rte_mov64(dst + 1 * 64, src + 1 * 64); > > > > - rte_mov64(dst + 2 * 64, src + 2 * 64); > > > > - rte_mov64(dst + 3 * 64, src + 3 * 64); > > > > + rte_mov128(dst + 0 * 128, src + 0 * 128); > > > > + rte_mov128(dst + 1 * 128, src + 1 * 128); > > > > } > > > > > > > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > > > > + > > > > +/** > > > > + * AVX512 implementation below > > > > + */ > > > > + > > > > +#define ALIGNMENT_MASK 0x3F > > > > + > > > > /** > > > > * Copy 128-byte blocks from one location to another, > > > > * locations should not overlap. > > > > @@ -231,12 +240,22 @@ rte_memcpy_generic(void *dst, const void *src, > size_t > > > n) > > > > /** > > > > * Fast way when copy size doesn't exceed 512 bytes > > > > */ > > > > + if (__builtin_constant_p(n) && n == 32) { > > > > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > > > > + return ret; > > > > + } > > > > > > There's an outstanding patchset from Stephen to replace all use of > > > rte_memcpy with a constant parameter with an actual call to regular > memcpy. > > > On a wider scale should we not look to do something similar in this file, > > > have calls to rte_memcpy with constant parameter always turn into a call > to > > > regular memcpy? We used to have such a macro in older DPDK e.g. > > > from DPDK 1.8 > > > > > > > http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcp > > > y.h?h=v1.8.0#n171 > > > > > > This would elminiate the need to put in constant_p checks all through the > > > code. > > > > The old macro in DPDK 1.8 was removed with the description "Remove slow > glibc call for constant copies": > > > https://git.dpdk.org/dpdk/commit/lib/librte_eal/common/include/arch/x86/rte_me > mcpy.h?id=9144d6bcdefd5096a9f3f89a3ce433a54ed84475 > > > > Stephen believes that the memcpy() built-ins provided by compilers are > faster than rte_memcpy() for constant size. > > I'm not convinced. > > Such a change should be backed up by performance tests, preferably for all > supported compilers - especially the old compilers that come with some of the > supported distros might not be as good as we would hope. > > > > I would tend to agree with Stephen that whereever possible we should use > the built-in memcpy calls. 
Hence my suggestion of re-introducing the macro. I agree in principle, but strongly prefer data to back up such changes in the fast path. > I'm not sure why it previously was seen as slower, it may be that the > compiler-expanded memcpy calls are not done beyond a certain size. > However, since we lack data, I'm ok with taking the changes in your patch > as-is. > > With the above-flagged superfluous AVX512 check on AVX2 code removed: > > Acked-by: Bruce Richardson <bruce.richardson@intel.com> Thanks. I'll provide a v3 patch. ^ permalink raw reply [flat|nested] 40+ messages in thread
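On the RTE_MEMCPY_AVX512 point earlier in this message, the gating is a two-part condition: __AVX512F__ comes from the compiler's target flags, while RTE_MEMCPY_AVX512 must be passed in explicitly by the build (for example through the build system's c_args), so an AVX512-capable CPU alone does not select the 64-byte path. A schematic sketch, not the actual header:

#if defined(__AVX512F__) && defined(RTE_MEMCPY_AVX512)
/* 64-byte (zmm) loads and stores may be used */
#else
/* fall back to the 32-byte (AVX) or 16-byte (SSE) implementations */
#endif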
* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 15:37 ` Morten Brørup @ 2024-04-04 15:55 ` Stephen Hemminger 2024-04-04 16:10 ` Morten Brørup 0 siblings, 1 reply; 40+ messages in thread From: Stephen Hemminger @ 2024-04-04 15:55 UTC (permalink / raw) To: Morten Brørup Cc: Bruce Richardson, konstantin.v.ananyev, mattias.ronnblom, dev On Thu, 4 Apr 2024 17:37:53 +0200 Morten Brørup <mb@smartsharesystems.com> wrote: > > I would tend to agree with Stephen that whereever possible we should use > > the built-in memcpy calls. Hence my suggestion of re-introducing the macro. > > I agree in principle, but strongly prefer data to back up such changes in the fast path. godbolt.org shows same instructions for the cases I looked at. ^ permalink raw reply [flat|nested] 40+ messages in thread
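For anyone wanting to repeat the comparison, a snippet of the sort that can be pasted into a compiler explorer with -O3 and a suitable -march; with recent GCC and Clang both functions typically compile to the same unaligned 16-byte load/store pair. This mirrors the comparison under discussion and is not taken from the DPDK tree:

#include <string.h>
#include <immintrin.h>

void
copy16_builtin(void *dst, const void *src)
{
        memcpy(dst, src, 16);           /* compiler's built-in memcpy */
}

void
copy16_intrinsic(void *dst, const void *src)
{
        __m128i x = _mm_loadu_si128((const __m128i *)src);
        _mm_storeu_si128((__m128i *)dst, x);
}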
* RE: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 15:55 ` Stephen Hemminger @ 2024-04-04 16:10 ` Morten Brørup 2024-04-04 16:55 ` Bruce Richardson 0 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-04-04 16:10 UTC (permalink / raw) To: Stephen Hemminger Cc: Bruce Richardson, konstantin.v.ananyev, mattias.ronnblom, dev > From: Stephen Hemminger [mailto:stephen@networkplumber.org] > Sent: Thursday, 4 April 2024 17.56 > > On Thu, 4 Apr 2024 17:37:53 +0200 > Morten Brørup <mb@smartsharesystems.com> wrote: > > > > I would tend to agree with Stephen that whereever possible we should use > > > the built-in memcpy calls. Hence my suggestion of re-introducing the > macro. > > > > I agree in principle, but strongly prefer data to back up such changes in > the fast path. > > > godbolt.org shows same instructions for the cases I looked at. Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested. On behalf of the distros, I'm mostly worried about older compilers. Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first. ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH v2] eal/x86: improve rte_memcpy const size 16 performance 2024-04-04 16:10 ` Morten Brørup @ 2024-04-04 16:55 ` Bruce Richardson 0 siblings, 0 replies; 40+ messages in thread From: Bruce Richardson @ 2024-04-04 16:55 UTC (permalink / raw) To: Morten Brørup Cc: Stephen Hemminger, konstantin.v.ananyev, mattias.ronnblom, dev On Thu, Apr 04, 2024 at 06:10:32PM +0200, Morten Brørup wrote: > > From: Stephen Hemminger [mailto:stephen@networkplumber.org] > > Sent: Thursday, 4 April 2024 17.56 > > > > On Thu, 4 Apr 2024 17:37:53 +0200 > > Morten Brørup <mb@smartsharesystems.com> wrote: > > > > > > I would tend to agree with Stephen that whereever possible we should use > > > > the built-in memcpy calls. Hence my suggestion of re-introducing the > > macro. > > > > > > I agree in principle, but strongly prefer data to back up such changes in > > the fast path. > > > > > > godbolt.org shows same instructions for the cases I looked at. > > Such a fundamental change belongs in a separate patch, with a description of what has been confirmed to generate same instructions or otherwise tested. > On behalf of the distros, I'm mostly worried about older compilers. > > Anyway, this patch also tidies up the code, removing a lot of copy-paste, so I think we should go ahead with this patch first. > I agree. Best to keep such changes in separate patches. ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (3 preceding siblings ...) 2024-03-03 9:46 ` [PATCH v2] " Morten Brørup @ 2024-03-03 16:05 ` Stephen Hemminger 2024-04-05 12:46 ` [PATCH v3] " Morten Brørup ` (8 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Stephen Hemminger @ 2024-03-03 16:05 UTC (permalink / raw) To: Morten Brørup Cc: bruce.richardson, konstantin.v.ananyev, mattias.ronnblom, dev Another option would be to just do what PPC already does. The ENA part is because it has some garbage trying to use memcpy always (which is one of those bad ideas). From 74e7ab929e61e0481f6e0214d4d06a716b2f7d79 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger <stephen@networkplumber.org> Date: Sun, 3 Mar 2024 08:02:07 -0800 Subject: [PATCH] rte_memcpy: use builtin memcpy for fixed sizes This makes x86 arch do same thing as PPC, and also allows code checkers to see memcpy issues. It shows a pre-existing bug in ipsec test now. Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> --- drivers/net/ena/base/ena_plat_dpdk.h | 9 +----- lib/eal/x86/include/rte_memcpy.h | 45 +++++++++++++++------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/net/ena/base/ena_plat_dpdk.h b/drivers/net/ena/base/ena_plat_dpdk.h index 14bf582a451f..997e6aa3dfbd 100644 --- a/drivers/net/ena/base/ena_plat_dpdk.h +++ b/drivers/net/ena/base/ena_plat_dpdk.h @@ -70,14 +70,7 @@ typedef uint64_t dma_addr_t; #define ENA_UDELAY(x) rte_delay_us_block(x) #define ENA_TOUCH(x) ((void)(x)) -/* Redefine memcpy with caution: rte_memcpy can be simply aliased to memcpy, so - * make the redefinition only if it's safe (and beneficial) to do so. - */ -#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64_MEMCPY) || \ - defined(RTE_ARCH_ARM_NEON_MEMCPY) -#undef memcpy -#define memcpy rte_memcpy -#endif + #define wmb rte_wmb #define rmb rte_rmb #define mb rte_mb diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e05d..aab30be0eeb9 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,24 +27,6 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif -/** - * Copy bytes from one location to another. The locations must not overlap. - * - * @note This is implemented as a macro, so it's address should not be taken - * and care is needed as parameter expressions may be evaluated multiple times. - * - * @param dst - * Pointer to the destination of the data. - * @param src - * Pointer to the source data. - * @param n - * Number of bytes to copy. - * @return - * Pointer to the destination data. - */ -static __rte_always_inline void * -rte_memcpy(void *dst, const void *src, size_t n); - /** * Copy bytes from one location to another, * locations should not overlap. @@ -859,8 +841,8 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) return ret; } -static __rte_always_inline void * -rte_memcpy(void *dst, const void *src, size_t n) +static inline void * +rte_memcpy_func(void *dst, const void *src, size_t n) { if (!(((uintptr_t)dst | (uintptr_t)src) & ALIGNMENT_MASK)) return rte_memcpy_aligned(dst, src, n); @@ -868,6 +850,29 @@ rte_memcpy(void *dst, const void *src, size_t n) return rte_memcpy_generic(dst, src, n); } + +/** + * Copy bytes from one location to another. The locations must not overlap. 
+ * + * @note This is implemented as a macro, so it's address should not be taken + * and care is needed as parameter expressions may be evaluated multiple times. + * + * @param dst + * Pointer to the destination of the data. + * @param src + * Pointer to the source data. + * @param n + * Number of bytes to copy. + * @return + * Pointer to the destination data. + */ +#define rte_memcpy(dst, src, n) \ + __extension__ ({ \ + (__builtin_constant_p(n)) ? \ + memcpy((dst), (src), (n)) : \ + rte_memcpy_func((dst), (src), (n)); }) + + #undef ALIGNMENT_MASK #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) -- 2.43.0 ^ permalink raw reply [flat|nested] 40+ messages in thread
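To illustrate how call sites would behave if the macro form in the patch above were applied (the helper functions below are hypothetical, not part of the current DPDK API): a size that is a compile-time constant is seen by __builtin_constant_p and expands to the compiler's memcpy, while a run-time size falls through to rte_memcpy_func():

#include <stddef.h>
#include <stdint.h>
#include <rte_memcpy.h>

struct hdr {
        uint8_t bytes[16];
};

static inline void
copy_hdr(struct hdr *d, const struct hdr *s)
{
        /* sizeof(*d) is a constant 16: expands to memcpy(d, s, 16), which
         * the compiler inlines as a single 16-byte load/store. */
        rte_memcpy(d, s, sizeof(*d));
}

static inline void
copy_payload(void *d, const void *s, size_t len)
{
        /* len is only known at run time, so __builtin_constant_p(len) is
         * normally 0 here and the call goes to rte_memcpy_func(). */
        rte_memcpy(d, s, len);
}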
* [PATCH v3] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (4 preceding siblings ...) 2024-03-03 16:05 ` [PATCH] " Stephen Hemminger @ 2024-04-05 12:46 ` Morten Brørup 2024-04-05 13:17 ` Bruce Richardson 2024-04-05 13:48 ` [PATCH v4] " Morten Brørup ` (7 subsequent siblings) 13 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-04-05 12:46 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build tine, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 234 ++++++++----------------------- 1 file changed, 59 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..b56bc46713 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,11 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* GCC prior to version 11 doesn't compile AVX properly, so use SSE instead. */ +#if defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +96,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +116,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +134,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +163,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +244,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +336,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +393,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +468,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - /** - * SSE & AVX implementation below + * SSE implementation below */ -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +588,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +702,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + 
if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +717,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
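As background for the three ALIGNMENT_MASK values kept by this rework (0x3F, 0x1F, 0x0F): the mask matches the widest vector register a given build uses, and the existing top-level rte_memcpy() dispatch, unchanged by this patch, only takes the aligned path when both pointers are aligned to that width. A sketch of that check, written out separately for illustration:

#include <stdint.h>

/* Illustrative only; in the header this is open-coded in rte_memcpy()
 * using the ALIGNMENT_MASK selected above (0x3F for 64-byte AVX512,
 * 0x1F for 32-byte AVX, 0x0F for 16-byte SSE). */
static inline int
both_aligned(const void *dst, const void *src, uintptr_t mask)
{
        return (((uintptr_t)dst | (uintptr_t)src) & mask) == 0;
}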
* Re: [PATCH v3] eal/x86: improve rte_memcpy const size 16 performance 2024-04-05 12:46 ` [PATCH v3] " Morten Brørup @ 2024-04-05 13:17 ` Bruce Richardson 0 siblings, 0 replies; 40+ messages in thread From: Bruce Richardson @ 2024-04-05 13:17 UTC (permalink / raw) To: Morten Brørup; +Cc: konstantin.v.ananyev, stephen, mattias.ronnblom, dev On Fri, Apr 05, 2024 at 02:46:28PM +0200, Morten Brørup wrote: > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build tine, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > --- > v3: > * AVX2 is a superset of AVX; > for a block of AVX code, testing for AVX suffices. (Bruce Richardson) > * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the > check for older GCC version. (Bruce Richardson) > v2: > * For GCC, version 11 is required for proper AVX handling; > if older GCC version, treat AVX as SSE. > Clang does not have this issue. > Note: Original code always treated AVX as SSE, regardless of compiler. > * Do not add copyright. (Stephen Hemminger) > --- > lib/eal/x86/include/rte_memcpy.h | 234 ++++++++----------------------- > 1 file changed, 59 insertions(+), 175 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h > index 72a92290e0..b56bc46713 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -27,6 +27,11 @@ extern "C" { > #pragma GCC diagnostic ignored "-Wstringop-overflow" > #endif > > +/* GCC prior to version 11 doesn't compile AVX properly, so use SSE instead. */ > +#if defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) > +#define RTE_MEMCPY_AVX > +#endif > + Strictly speaking, to have the same behaviour as before, you need to check for AVX2 also, since the issue with GCC < 11 is for (AVX && !AVX2), i.e. if AVX2 is supported, all compilers are fine. My suggestion: #ifdef __AVX2__ #define RTE_MEMCPY_AVX #elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) #define RTE_MEMCPY_AVX #endif You can obviously merge the two branches if you want, but I find the split slightly easier to follow, than a mix of && and || with brackets for precedence. Final alternative I see, you can change defined(RTE_MEMCPY_AVX) to "defined(__AVX2__) || defined(RTE_MEMCPY_AVX)" each place it's used. /Bruce ^ permalink raw reply [flat|nested] 40+ messages in thread
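For reference on the version test discussed above: GCC_VERSION is DPDK's packed GCC version number (major * 10000 + minor * 100 + patchlevel, defined in the common EAL headers for GCC builds), so 110000 is GCC 11.0.0 and "GCC_VERSION < 110000" means any GCC release before 11. Written out here only for illustration:

#if defined(RTE_TOOLCHAIN_GCC) && !defined(GCC_VERSION)
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif

/* Examples: GCC 10.3.0 -> 100300, GCC 11.0.0 -> 110000. */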
* [PATCH v4] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (5 preceding siblings ...) 2024-04-05 12:46 ` [PATCH v3] " Morten Brørup @ 2024-04-05 13:48 ` Morten Brørup 2024-05-27 13:15 ` Morten Brørup ` (6 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-04-05 13:48 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build tine, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..d687aa7756 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); 
rte_mov32((uint8_t *)dst - 32 + n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
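For illustration, the new __builtin_constant_p(n) && n == 16 check only takes effect when the copy size is a compile-time constant at the call site. A minimal sketch in C, not part of the patch and using hypothetical names:

#include <stddef.h>
#include <stdint.h>
#include <rte_memcpy.h>

struct flow_key {	/* hypothetical 16-byte key */
	uint8_t bytes[16];
};

static inline void
copy_key(struct flow_key *dst, const struct flow_key *src)
{
	/* sizeof(*src) is the build-time constant 16, so the compiler sees
	 * __builtin_constant_p(n) && n == 16 as true and emits a single
	 * rte_mov16() instead of two overlapping 16-byte copies. */
	rte_memcpy(dst, src, sizeof(*src));
}

static inline void
copy_payload(void *dst, const void *src, size_t len)
{
	/* len is only known at run time, so __builtin_constant_p(len) is
	 * false and rte_memcpy() behaves exactly as before the patch. */
	rte_memcpy(dst, src, len);
}

Before the patch, copy_key() compiled to two 16-byte loads and stores over the same bytes; with the patch it compiles to one.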
* [PATCH v4] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (6 preceding siblings ...) 2024-04-05 13:48 ` [PATCH v4] " Morten Brørup @ 2024-05-27 13:15 ` Morten Brørup 2024-05-27 13:16 ` [PATCH v5] " Morten Brørup ` (5 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-27 13:15 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..d687aa7756 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__builtin_constant_p(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__builtin_constant_p(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__builtin_constant_p(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); 
rte_mov32((uint8_t *)dst - 32 + n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
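As a rough illustration of the v2 to v4 notes above, which reduce to one compile-time gate in the header, a small self-check program (not part of the patch) can report which implementation rte_memcpy.h selects for a given toolchain and instruction-set flags:

#include <stdio.h>
#include <rte_memcpy.h>

int
main(void)
{
	/* Mirrors the #if structure of the patched header. */
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	puts("AVX512 path, ALIGNMENT_MASK 0x3F");
#elif defined RTE_MEMCPY_AVX
	puts("AVX path (AVX2, or AVX with a compiler that handles it), ALIGNMENT_MASK 0x1F");
#else
	puts("SSE path, ALIGNMENT_MASK 0x0F");
#endif
	return 0;
}

With GCC older than 11 and only AVX enabled, RTE_MEMCPY_AVX stays undefined and the SSE path is chosen, which is what the v2 changelog note describes.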
* [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (7 preceding siblings ...) 2024-05-27 13:15 ` Morten Brørup @ 2024-05-27 13:16 ` Morten Brørup 2024-05-27 14:13 ` Morten Brørup 2024-05-28 6:18 ` Morten Brørup ` (4 subsequent siblings) 13 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-05-27 13:16 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p") Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..1619a8f296 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 
+ n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
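The v5 change switches to __rte_constant() from the depended-on series; its definition is not shown in this thread, so the following is only an assumed sketch of the kind of abstraction it provides, not the actual DPDK code:

/* Assumed shape of the toolchain abstraction; the real definition comes
 * from the depended-on series, not from this patch. */
#ifdef RTE_TOOLCHAIN_MSVC
#define __rte_constant(e) 0	/* MSVC: treat nothing as a known constant */
#else
#define __rte_constant(e) __builtin_constant_p(e)	/* GCC and clang builtin */
#endif

Under a definition of this shape, the constant-size shortcuts simply compile away with MSVC, while GCC and clang keep the behaviour of the earlier __builtin_constant_p() version.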
* RE: [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance 2024-05-27 13:16 ` [PATCH v5] " Morten Brørup @ 2024-05-27 14:13 ` Morten Brørup 0 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-27 14:13 UTC (permalink / raw) To: dev Recheck-request: iol-testing ^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH v5] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (8 preceding siblings ...) 2024-05-27 13:16 ` [PATCH v5] " Morten Brørup @ 2024-05-28 6:18 ` Morten Brørup 2024-05-28 6:22 ` [PATCH v6] " Morten Brørup ` (3 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-28 6:18 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p") Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v6: * Don't wrap the Depends-on line. It seems not to have been understood when wrapped. v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..1619a8f296 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 
+ n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
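A detail shared by every version of the diff is the removal of the separate n <= 48 branch from the SSE rte_memcpy_generic(): the merged n <= 64 branch only issues the middle 16-byte copy when n > 48. A sketch, not code from the patch, of what that branch does for a hypothetical run-time size of 40 bytes:

#include <stddef.h>
#include <stdint.h>
#include <rte_memcpy.h>

static inline void
copy40_sse_shape(uint8_t *dst, const uint8_t *src)
{
	const size_t n = 40;

	rte_mov32(dst, src);			/* bytes 0..31 */
	/* n <= 48, so rte_mov16(dst + 32, src + 32) is skipped */
	rte_mov16(dst - 16 + n, src - 16 + n);	/* bytes 24..39; the overlap
						 * with bytes 24..31 is harmless */
}

Every byte from 0 to 39 is still written, so dropping the dedicated 48-byte branch does not change the result; it mainly removes a duplicated branch from the code.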
* [PATCH v6] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (9 preceding siblings ...) 2024-05-28 6:18 ` Morten Brørup @ 2024-05-28 6:22 ` Morten Brørup 2024-05-28 7:05 ` [PATCH v7] " Morten Brørup ` (2 subsequent siblings) 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-28 6:22 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Depends-on: series-31578 ("provide toolchain abstracted ...") Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- v6: * Don't wrap the Depends-on line. It seems not to have been understood when wrapped. v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..1619a8f296 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 
+ n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH v7] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (10 preceding siblings ...) 2024-05-28 6:22 ` [PATCH v6] " Morten Brørup @ 2024-05-28 7:05 ` Morten Brørup 2024-05-30 15:41 ` [PATCH v8] " Morten Brørup 2024-07-09 13:27 ` [PATCH v9] " Morten Brørup 13 siblings, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-28 7:05 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- Depends-on: patch-138647 ("eal: provide macro for GCC builtin constant intrinsic") v7: * Keep trying to get CI to understand the dependency: depend on the patch instead of the series, and move the dependency out of the patch description itself, down to the version log. v6: * Trying to fix CI not understanding the dependency: don't wrap the dependency line. v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..1619a8f296 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 
+ n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
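For illustration, a minimal sketch of the caller pattern this optimization targets (the struct and helper below are hypothetical, not part of the patch or of DPDK; only rte_memcpy() itself is real):

#include <stdint.h>
#include <rte_memcpy.h>

/* Hypothetical 16-byte key; sizeof(struct flow_key) == 16. */
struct flow_key {
	uint64_t lo;
	uint64_t hi;
};

static inline void
copy_flow_key(struct flow_key *dst, const struct flow_key *src)
{
	/*
	 * sizeof(*dst) is a build-time constant equal to 16, so inside the
	 * always-inlined rte_memcpy() the new "__rte_constant(n) && n == 16"
	 * check evaluates true (with optimization enabled): a single 16-byte
	 * load/store pair is emitted instead of the two overlapping
	 * rte_mov16() calls of the unpatched code.
	 */
	rte_memcpy(dst, src, sizeof(*dst));
}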
* [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (11 preceding siblings ...) 2024-05-28 7:05 ` [PATCH v7] " Morten Brørup @ 2024-05-30 15:41 ` Morten Brørup 2024-06-10 9:05 ` Morten Brørup 2024-06-10 13:40 ` Konstantin Ananyev 2024-07-09 13:27 ` [PATCH v9] " Morten Brørup 13 siblings, 2 replies; 40+ messages in thread From: Morten Brørup @ 2024-05-30 15:41 UTC (permalink / raw) To: bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, aconole, dev, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> --- Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p") v8: * Keep trying to fix that CI does not understand the dependency... Depend on series instead of patch. Github only understands series. * Fix typo in patch description. v7: * Keep trying to fix that CI does not understand the dependency... Depend on patch instead of series. Move dependency out of the patch description itself, and down to the version log. v6: * Trying to fix CI not understanding dependency... Don't wrap dependency line. v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- 1 file changed, 64 insertions(+), 175 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..1619a8f296 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F - -/** - * AVX2 implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} +#elif defined RTE_MEMCPY_AVX /** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
+ * AVX implementation below */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} - /** - * Copy 64 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 
+ n, -- 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
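The reorganized #if structure above means one set of rte_mov16()/rte_mov32()/rte_mov64() definitions now serves all ISA levels. A rough compile-time probe of which path a given build selects could look like this (illustrative only; assumes a DPDK build environment and that RTE_MEMCPY_AVX512 remains an explicit build-time define, as in the existing header):

#include <stdio.h>
#include <rte_memcpy.h>

int
main(void)
{
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	puts("AVX512 path: 64-byte moves, ALIGNMENT_MASK 0x3F");
#elif defined RTE_MEMCPY_AVX
	puts("AVX path: 32-byte moves, ALIGNMENT_MASK 0x1F");
#else
	puts("SSE path: 16-byte moves, ALIGNMENT_MASK 0x0F");
#endif
	return 0;
}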
* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-05-30 15:41 ` [PATCH v8] " Morten Brørup @ 2024-06-10 9:05 ` Morten Brørup 2024-06-10 13:40 ` Konstantin Ananyev 1 sibling, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-06-10 9:05 UTC (permalink / raw) To: konstantin.v.ananyev, stephen Cc: mattias.ronnblom, roretzla, dev, bruce.richardson PING for review. The CI failures can be ignored: Most of the CI doesn't support the Depends-on tag, and this patch uses __rte_constant(), provided by Tyler's patch series [1]. [1]: https://inbox.dpdk.org/dev/1710970416-27841-1-git-send-email-roretzla@linux.microsoft.com/ -Morten > From: Morten Brørup [mailto:mb@smartsharesystems.com] > Sent: Thursday, 30 May 2024 17.41 > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build time, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > --- > Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p") > > v8: > * Keep trying to fix that CI does not understand the dependency... > Depend on series instead of patch. Github only understands series. > * Fix typo in patch description. > v7: > * Keep trying to fix that CI does not understand the dependency... > Depend on patch instead of series. > Move dependency out of the patch description itself, and down to the > version log. > v6: > * Trying to fix CI not understanding dependency... > Don't wrap dependency line. > v5: > * Fix for building with MSVC: > Use __rte_constant() instead of __builtin_constant_p(). > Add dependency on patch providing __rte_constant(). > v4: > * There are no problems compiling AVX2, only AVX. (Bruce Richardson) > v3: > * AVX2 is a superset of AVX; > for a block of AVX code, testing for AVX suffices. (Bruce Richardson) > * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the > check for older GCC version. (Bruce Richardson) > v2: > * For GCC, version 11 is required for proper AVX handling; > if older GCC version, treat AVX as SSE. > Clang does not have this issue. > Note: Original code always treated AVX as SSE, regardless of compiler. > * Do not add copyright. (Stephen Hemminger) > --- > lib/eal/x86/include/rte_memcpy.h | 239 +++++++++---------------------- > 1 file changed, 64 insertions(+), 175 deletions(-) > > diff --git a/lib/eal/x86/include/rte_memcpy.h > b/lib/eal/x86/include/rte_memcpy.h > index 72a92290e0..1619a8f296 100644 > --- a/lib/eal/x86/include/rte_memcpy.h > +++ b/lib/eal/x86/include/rte_memcpy.h > @@ -27,6 +27,16 @@ extern "C" { > #pragma GCC diagnostic ignored "-Wstringop-overflow" > #endif > > +/* > + * GCC older than version 11 doesn't compile AVX properly, so use SSE > instead. > + * There are no problems with AVX2. > + */ > +#if defined __AVX2__ > +#define RTE_MEMCPY_AVX > +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < > 110000)) > +#define RTE_MEMCPY_AVX > +#endif > + > /** > * Copy bytes from one location to another. The locations must not overlap. 
> * > @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) > return ret; > } > > -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > - > -#define ALIGNMENT_MASK 0x3F > - > -/** > - * AVX512 implementation below > - */ > - > /** > * Copy 16 bytes from one location to another, > * locations should not overlap. > @@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov32(uint8_t *dst, const uint8_t *src) > { > +#if defined RTE_MEMCPY_AVX > __m256i ymm0; > > ymm0 = _mm256_loadu_si256((const __m256i *)src); > _mm256_storeu_si256((__m256i *)dst, ymm0); > +#else /* SSE implementation */ > + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > +#endif > } > > /** > @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov64(uint8_t *dst, const uint8_t *src) > { > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > __m512i zmm0; > > zmm0 = _mm512_loadu_si512((const void *)src); > _mm512_storeu_si512((void *)dst, zmm0); > +#else /* AVX2, AVX & SSE implementation */ > + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > +#endif > } > > /** > @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) > static __rte_always_inline void > rte_mov256(uint8_t *dst, const uint8_t *src) > { > - rte_mov64(dst + 0 * 64, src + 0 * 64); > - rte_mov64(dst + 1 * 64, src + 1 * 64); > - rte_mov64(dst + 2 * 64, src + 2 * 64); > - rte_mov64(dst + 3 * 64, src + 3 * 64); > + rte_mov128(dst + 0 * 128, src + 0 * 128); > + rte_mov128(dst + 1 * 128, src + 1 * 128); > } > > +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 > + > +/** > + * AVX512 implementation below > + */ > + > +#define ALIGNMENT_MASK 0x3F > + > /** > * Copy 128-byte blocks from one location to another, > * locations should not overlap. > @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > /** > * Fast way when copy size doesn't exceed 512 bytes > */ > + if (__rte_constant(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > + if (__rte_constant(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > return ret; > } > + if (__rte_constant(n) && n == 64) { > + rte_mov64((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 64) { > rte_mov32((uint8_t *)dst, (const uint8_t *)src); > rte_mov32((uint8_t *)dst - 32 + n, > @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > goto COPY_BLOCK_128_BACK63; > } > > -#elif defined __AVX2__ > - > -#define ALIGNMENT_MASK 0x1F > - > -/** > - * AVX2 implementation below > - */ > - > -/** > - * Copy 16 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov16(uint8_t *dst, const uint8_t *src) > -{ > - __m128i xmm0; > - > - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); > - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); > -} > - > -/** > - * Copy 32 bytes from one location to another, > - * locations should not overlap. 
> - */ > -static __rte_always_inline void > -rte_mov32(uint8_t *dst, const uint8_t *src) > -{ > - __m256i ymm0; > - > - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); > - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); > -} > +#elif defined RTE_MEMCPY_AVX > > /** > - * Copy 64 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov64(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > -} > - > -/** > - * Copy 128 bytes from one location to another, > - * locations should not overlap. > + * AVX implementation below > */ > -static __rte_always_inline void > -rte_mov128(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); > - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); > -} > > -/** > - * Copy 256 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov256(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); > - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); > - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); > - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); > - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); > - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); > - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); > - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); > -} > +#define ALIGNMENT_MASK 0x1F > > /** > * Copy 128-byte blocks from one location to another, > @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > /** > * Fast way when copy size doesn't exceed 256 bytes > */ > - if (n <= 32) { > - rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst - 16 + n, > - (const uint8_t *)src - 16 + n); > + if (__rte_constant(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > return ret; > } > - if (n <= 48) { > + if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); > + if (__rte_constant(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > return ret; > @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > > #else /* __AVX512F__ */ > > -#define ALIGNMENT_MASK 0x0F > - > -/** > - * SSE & AVX implementation below > - */ > - > -/** > - * Copy 16 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov16(uint8_t *dst, const uint8_t *src) > -{ > - __m128i xmm0; > - > - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); > - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); > -} > - > -/** > - * Copy 32 bytes from one location to another, > - * locations should not overlap. 
> - */ > -static __rte_always_inline void > -rte_mov32(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > -} > - > /** > - * Copy 64 bytes from one location to another, > - * locations should not overlap. > + * SSE implementation below > */ > -static __rte_always_inline void > -rte_mov64(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > -} > > -/** > - * Copy 128 bytes from one location to another, > - * locations should not overlap. > - */ > -static __rte_always_inline void > -rte_mov128(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); > - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); > - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); > - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); > -} > - > -/** > - * Copy 256 bytes from one location to another, > - * locations should not overlap. > - */ > -static inline void > -rte_mov256(uint8_t *dst, const uint8_t *src) > -{ > - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); > - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); > - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); > - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); > - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); > - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); > - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); > - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); > - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); > - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); > - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); > - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); > - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); > - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); > - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); > - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); > -} > +#define ALIGNMENT_MASK 0x0F > > /** > * Macro for copying unaligned block from one location to another with > constant load offset, > @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) > */ > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > - return ret; > - } > - if (n <= 48) { > - rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + if (__rte_constant(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > return ret; > } > if (n <= 64) { > rte_mov32((uint8_t *)dst, (const uint8_t *)src); > - 
rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); > + if (n > 48) > + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); > rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + > n); > return ret; > } > @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) > } > > /* Copy 16 <= size <= 32 bytes */ > + if (__rte_constant(n) && n == 32) { > + rte_mov32((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 32) { > rte_mov16((uint8_t *)dst, (const uint8_t *)src); > + if (__rte_constant(n) && n == 16) > + return ret; /* avoid (harmless) duplicate copy */ > rte_mov16((uint8_t *)dst - 16 + n, > (const uint8_t *)src - 16 + n); > > @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) > } > > /* Copy 32 < size <= 64 bytes */ > + if (__rte_constant(n) && n == 64) { > + rte_mov64((uint8_t *)dst, (const uint8_t *)src); > + return ret; > + } > if (n <= 64) { > rte_mov32((uint8_t *)dst, (const uint8_t *)src); > rte_mov32((uint8_t *)dst - 32 + n, > -- > 2.17.1 ^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-05-30 15:41 ` [PATCH v8] " Morten Brørup 2024-06-10 9:05 ` Morten Brørup @ 2024-06-10 13:40 ` Konstantin Ananyev 2024-06-10 13:59 ` Morten Brørup 2024-07-09 9:24 ` David Marchand 1 sibling, 2 replies; 40+ messages in thread From: Konstantin Ananyev @ 2024-06-10 13:40 UTC (permalink / raw) To: Morten Brørup, bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, aconole, dev > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build time, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > --- > Depends-on: series-31578 ("provide toolchain abstracted __builtin_constant_p") > > v8: > * Keep trying to fix that CI does not understand the dependency... > Depend on series instead of patch. Github only understands series. > * Fix typo in patch description. > v7: > * Keep trying to fix that CI does not understand the dependency... > Depend on patch instead of series. > Move dependency out of the patch description itself, and down to the > version log. > v6: > * Trying to fix CI not understanding dependency... > Don't wrap dependency line. > v5: > * Fix for building with MSVC: > Use __rte_constant() instead of __builtin_constant_p(). > Add dependency on patch providing __rte_constant(). > v4: > * There are no problems compiling AVX2, only AVX. (Bruce Richardson) > v3: > * AVX2 is a superset of AVX; > for a block of AVX code, testing for AVX suffices. (Bruce Richardson) > * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the > check for older GCC version. (Bruce Richardson) > v2: > * For GCC, version 11 is required for proper AVX handling; > if older GCC version, treat AVX as SSE. > Clang does not have this issue. > Note: Original code always treated AVX as SSE, regardless of compiler. > * Do not add copyright. (Stephen Hemminger) Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> The code change itself - LGTM. Out of interest - do you expect any perf diff with these changes? On my box I didn’t see any with 'memcpy_perf_autotest'. Konstantin ^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-06-10 13:40 ` Konstantin Ananyev @ 2024-06-10 13:59 ` Morten Brørup 2024-07-09 9:24 ` David Marchand 1 sibling, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-06-10 13:59 UTC (permalink / raw) To: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla Cc: mattias.ronnblom, aconole, dev > From: Konstantin Ananyev [mailto:konstantin.ananyev@huawei.com] > Sent: Monday, 10 June 2024 15.40 > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > In the case where the size is known to be 16 at build time, omit the > > duplicate copy. > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > inside functions instead of outside functions. > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > > --- > > Depends-on: series-31578 ("provide toolchain abstracted > __builtin_constant_p") > > > > v8: > > * Keep trying to fix that CI does not understand the dependency... > > Depend on series instead of patch. Github only understands series. > > * Fix typo in patch description. > > v7: > > * Keep trying to fix that CI does not understand the dependency... > > Depend on patch instead of series. > > Move dependency out of the patch description itself, and down to the > > version log. > > v6: > > * Trying to fix CI not understanding dependency... > > Don't wrap dependency line. > > v5: > > * Fix for building with MSVC: > > Use __rte_constant() instead of __builtin_constant_p(). > > Add dependency on patch providing __rte_constant(). > > v4: > > * There are no problems compiling AVX2, only AVX. (Bruce Richardson) > > v3: > > * AVX2 is a superset of AVX; > > for a block of AVX code, testing for AVX suffices. (Bruce Richardson) > > * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the > > check for older GCC version. (Bruce Richardson) > > v2: > > * For GCC, version 11 is required for proper AVX handling; > > if older GCC version, treat AVX as SSE. > > Clang does not have this issue. > > Note: Original code always treated AVX as SSE, regardless of compiler. > > * Do not add copyright. (Stephen Hemminger) > > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> > > The code change itself - LGTM. > Out of interest - do you expect any perf diff with these changes? I don't expect a significant perf diff with these changes, not even for the affected special cases. But the generated code (for the affected cases) is smaller. Stephen noticed that the code generated from rte_memcpy() was inefficient in some cases [1], so I decided to fix it. [1]: https://inbox.dpdk.org/dev/20240302090207.428d4853@hermes.local/ The code generated from rte_memcpy() was not incorrect, only slightly inefficient (for the affected cases), so the patch is not a bugfix in need of backporting. > On my box I didn’t see any with 'memcpy_perf_autotest'. > Konstantin > ^ permalink raw reply [flat|nested] 40+ messages in thread
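A small compile-only probe makes the code-size effect Morten describes easy to inspect (the function name is made up; build with optimization, e.g. -O3, and compare the disassembly with and without the patch):

#include <rte_memcpy.h>

/*
 * With the patch and n given as the build-time constant 16, this should
 * reduce to one 16-byte load and one 16-byte store (plus returning dst);
 * without the patch it typically also contains a redundant second
 * load/store of the same 16 bytes.
 */
void *
copy16_probe(void *dst, const void *src)
{
	return rte_memcpy(dst, src, 16);
}

Inspecting the generated instructions this way shows the difference more directly than memcpy_perf_autotest, which measures copy throughput rather than instruction count.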
* Re: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-06-10 13:40 ` Konstantin Ananyev 2024-06-10 13:59 ` Morten Brørup @ 2024-07-09 9:24 ` David Marchand 2024-07-09 11:42 ` David Marchand 1 sibling, 1 reply; 40+ messages in thread From: David Marchand @ 2024-07-09 9:24 UTC (permalink / raw) To: Morten Brørup Cc: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev On Mon, Jun 10, 2024 at 3:40 PM Konstantin Ananyev <konstantin.ananyev@huawei.com> wrote: > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > In the case where the size is known to be 16 at build time, omit the > > duplicate copy. > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > inside functions instead of outside functions. > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> Applied, thanks for the cleanup. -- David Marchand ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 9:24 ` David Marchand @ 2024-07-09 11:42 ` David Marchand 2024-07-09 12:43 ` Morten Brørup 0 siblings, 1 reply; 40+ messages in thread From: David Marchand @ 2024-07-09 11:42 UTC (permalink / raw) To: Morten Brørup Cc: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev Hello, On Tue, Jul 9, 2024 at 11:24 AM David Marchand <david.marchand@redhat.com> wrote: > > On Mon, Jun 10, 2024 at 3:40 PM Konstantin Ananyev > <konstantin.ananyev@huawei.com> wrote: > > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > > > In the case where the size is known to be 16 at build time, omit the > > > duplicate copy. > > > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > > inside functions instead of outside functions. > > > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> > > Applied, thanks for the cleanup. This breaks OVS compilation (clang and gcc). make[1]: *** [Makefile:4722: lib/ofp-packet.lo] Error 1 make[1]: *** Waiting for unfinished jobs.... In file included from lib/ofp-print.c:34: In file included from ./lib/dp-packet.h:25: In file included from /home/runner/work/ovs/ovs/dpdk-dir/include/rte_mbuf.h:38: In file included from /home/runner/work/ovs/ovs/dpdk-dir/include/rte_mempool.h:50: /home/runner/work/ovs/ovs/dpdk-dir/include/rte_memcpy.h:113:25: error: cast from 'const uint8_t *' (aka 'const unsigned char *') to 'const __m128i *' increases required alignment from 1 to 16 [-Werror,-Wcast-align] xmm0 = _mm_loadu_si128((const __m128i *)src); ^~~~~~~~~~~~~~~~~~~~ /home/runner/work/ovs/ovs/dpdk-dir/include/rte_memcpy.h:114:19: error: cast from 'uint8_t *' (aka 'unsigned char *') to '__m128i *' increases required alignment from 1 to 16 [-Werror,-Wcast-align] _mm_storeu_si128((__m128i *)dst, xmm0); ^~~~~~~~~~~~~~ 2 errors generated. make[1]: *** [Makefile:4722: lib/ofp-print.lo] Error 1 make[1]: Leaving directory '/home/runner/work/ovs/ovs' make: *** [Makefile:3102: all] Error 2 I dropped this patch from main for now. Can you have a look please? -- David Marchand ^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 11:42 ` David Marchand @ 2024-07-09 12:43 ` Morten Brørup 2024-07-09 12:47 ` David Marchand 0 siblings, 1 reply; 40+ messages in thread From: Morten Brørup @ 2024-07-09 12:43 UTC (permalink / raw) To: David Marchand Cc: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev > From: David Marchand [mailto:david.marchand@redhat.com] > Sent: Tuesday, 9 July 2024 13.43 > > Hello, > > On Tue, Jul 9, 2024 at 11:24 AM David Marchand > <david.marchand@redhat.com> wrote: > > > > On Mon, Jun 10, 2024 at 3:40 PM Konstantin Ananyev > > <konstantin.ananyev@huawei.com> wrote: > > > > When the rte_memcpy() size is 16, the same 16 bytes are copied > twice. > > > > In the case where the size is known to be 16 at build time, omit > the > > > > duplicate copy. > > > > > > > > Reduced the amount of effectively copy-pasted code by using #ifdef > > > > inside functions instead of outside functions. > > > > > > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > > > > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > > > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> > > > > Applied, thanks for the cleanup. > > This breaks OVS compilation (clang and gcc). > > make[1]: *** [Makefile:4722: lib/ofp-packet.lo] Error 1 > make[1]: *** Waiting for unfinished jobs.... > In file included from lib/ofp-print.c:34: > In file included from ./lib/dp-packet.h:25: > In file included from /home/runner/work/ovs/ovs/dpdk- > dir/include/rte_mbuf.h:38: > In file included from > /home/runner/work/ovs/ovs/dpdk-dir/include/rte_mempool.h:50: > /home/runner/work/ovs/ovs/dpdk-dir/include/rte_memcpy.h:113:25: error: > cast from 'const uint8_t *' (aka 'const unsigned char *') to 'const > __m128i *' increases required alignment from 1 to 16 > [-Werror,-Wcast-align] > xmm0 = _mm_loadu_si128((const __m128i *)src); > ^~~~~~~~~~~~~~~~~~~~ > /home/runner/work/ovs/ovs/dpdk-dir/include/rte_memcpy.h:114:19: error: > cast from 'uint8_t *' (aka 'unsigned char *') to '__m128i *' increases > required alignment from 1 to 16 [-Werror,-Wcast-align] > _mm_storeu_si128((__m128i *)dst, xmm0); > ^~~~~~~~~~~~~~ > 2 errors generated. > make[1]: *** [Makefile:4722: lib/ofp-print.lo] Error 1 > make[1]: Leaving directory '/home/runner/work/ovs/ovs' > make: *** [Makefile:3102: all] Error 2 > > I dropped this patch from main for now. > Can you have a look please? It seems the new code casts directly to the vector register size, while the original code first cast to void*, and then to the register size. I'll try to fix it and post a new version. PS: The CI should catch this stuff. ^ permalink raw reply [flat|nested] 40+ messages in thread
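The difference Morten refers to, sketched in isolation (hypothetical function names; the second form is what the v9 revision below applies consistently):

#include <stdint.h>
#include <emmintrin.h>

/*
 * Direct cast: with -Wcast-align, as used by the OVS build, the compiler
 * warns that the required alignment increases from 1 to 16, even though
 * the unaligned loadu/storeu intrinsics do not need aligned pointers.
 */
static inline void
mov16_direct_cast(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0 = _mm_loadu_si128((const __m128i *)src);

	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/*
 * Cast via void *: the intermediate cast drops the alignment information
 * of uint8_t *, so -Wcast-align stays silent. This is the pattern most of
 * the pre-patch rte_mov16() variants used, and what v9 restores everywhere.
 */
static inline void
mov16_void_cast(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);

	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}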
* Re: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 12:43 ` Morten Brørup @ 2024-07-09 12:47 ` David Marchand 2024-07-09 12:54 ` Morten Brørup 2024-07-09 15:26 ` Patrick Robb 0 siblings, 2 replies; 40+ messages in thread From: David Marchand @ 2024-07-09 12:47 UTC (permalink / raw) To: Morten Brørup, Patrick Robb Cc: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev On Tue, Jul 9, 2024 at 2:43 PM Morten Brørup <mb@smartsharesystems.com> wrote: > PS: The CI should catch this stuff. Restoring OVS tests in CI has been requested and I think it was being worked on. Not sure where we are atm, Patrick? -- David Marchand ^ permalink raw reply [flat|nested] 40+ messages in thread
* RE: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 12:47 ` David Marchand @ 2024-07-09 12:54 ` Morten Brørup 2024-07-09 15:26 ` Patrick Robb 1 sibling, 0 replies; 40+ messages in thread From: Morten Brørup @ 2024-07-09 12:54 UTC (permalink / raw) To: David Marchand, Patrick Robb Cc: Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev > From: David Marchand [mailto:david.marchand@redhat.com] > Sent: Tuesday, 9 July 2024 14.48 > > On Tue, Jul 9, 2024 at 2:43 PM Morten Brørup <mb@smartsharesystems.com> > wrote: > > PS: The CI should catch this stuff. Working on fixing it now, some old variants of rte_mov16() do the extra cast, and some don't. It could be CPU feature (SSE/AVX/AVX512) specific. Also, the header file's definition of _mm_loadu_si128() has the wrong parameter type - it is specified as a type that must be aligned, although it is not required. The intrinsic header files are full of bugs like this. > > Restoring OVS tests in CI has been requested and I think it was being > worked on. > Not sure where we are atm, Patrick? > > > -- > David Marchand ^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH v8] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 12:47 ` David Marchand 2024-07-09 12:54 ` Morten Brørup @ 2024-07-09 15:26 ` Patrick Robb 1 sibling, 0 replies; 40+ messages in thread From: Patrick Robb @ 2024-07-09 15:26 UTC (permalink / raw) To: David Marchand Cc: Morten Brørup, Konstantin Ananyev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, aconole, dev On Tue, Jul 9, 2024 at 8:48 AM David Marchand <david.marchand@redhat.com> wrote: > > On Tue, Jul 9, 2024 at 2:43 PM Morten Brørup <mb@smartsharesystems.com> wrote: > > PS: The CI should catch this stuff. > > Restoring OVS tests in CI has been requested and I think it was being worked on. > Not sure where we are atm, Patrick? > OvS and SPDK compile jobs were added about a month ago. So, Morten's series should be getting flagged for OVS fails. An example from a series which already has CI finished: https://mails.dpdk.org/archives/test-report/2024-July/728503.html ^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH v9] eal/x86: improve rte_memcpy const size 16 performance 2024-03-02 23:48 [PATCH] eal/x86: improve rte_memcpy const size 16 performance Morten Brørup ` (12 preceding siblings ...) 2024-05-30 15:41 ` [PATCH v8] " Morten Brørup @ 2024-07-09 13:27 ` Morten Brørup 2024-07-09 15:42 ` David Marchand 2024-07-10 8:03 ` David Marchand 13 siblings, 2 replies; 40+ messages in thread From: Morten Brørup @ 2024-07-09 13:27 UTC (permalink / raw) To: dev, david.marchand Cc: bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom, Morten Brørup When the rte_memcpy() size is 16, the same 16 bytes are copied twice. In the case where the size is known to be 16 at build time, omit the duplicate copy. Reduced the amount of effectively copy-pasted code by using #ifdef inside functions instead of outside functions. Suggested-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: Morten Brørup <mb@smartsharesystems.com> Acked-by: Bruce Richardson <bruce.richardson@intel.com> Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> --- v9: * Remove dependency on other patch; it has been applied already. * Add extra casts to fix warnings in OVS build. (David Marchand) v8: * Keep trying to fix that CI does not understand the dependency... Depend on series instead of patch. Github only understands series. * Fix typo in patch description. v7: * Keep trying to fix that CI does not understand the dependency... Depend on patch instead of series. Move dependency out of the patch description itself, and down to the version log. v6: * Trying to fix CI not understanding dependency... Don't wrap dependency line. v5: * Fix for building with MSVC: Use __rte_constant() instead of __builtin_constant_p(). Add dependency on patch providing __rte_constant(). v4: * There are no problems compiling AVX2, only AVX. (Bruce Richardson) v3: * AVX2 is a superset of AVX; for a block of AVX code, testing for AVX suffices. (Bruce Richardson) * Define RTE_MEMCPY_AVX if AVX is available, to avoid copy-pasting the check for older GCC version. (Bruce Richardson) v2: * For GCC, version 11 is required for proper AVX handling; if older GCC version, treat AVX as SSE. Clang does not have this issue. Note: Original code always treated AVX as SSE, regardless of compiler. * Do not add copyright. (Stephen Hemminger) --- lib/eal/x86/include/rte_memcpy.h | 247 +++++++++---------------------- 1 file changed, 68 insertions(+), 179 deletions(-) diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h index 72a92290e0..42058e4a3f 100644 --- a/lib/eal/x86/include/rte_memcpy.h +++ b/lib/eal/x86/include/rte_memcpy.h @@ -27,6 +27,16 @@ extern "C" { #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif +/* + * GCC older than version 11 doesn't compile AVX properly, so use SSE instead. + * There are no problems with AVX2. + */ +#if defined __AVX2__ +#define RTE_MEMCPY_AVX +#elif defined __AVX__ && !(defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 110000)) +#define RTE_MEMCPY_AVX +#endif + /** * Copy bytes from one location to another. The locations must not overlap. * @@ -91,14 +101,6 @@ rte_mov15_or_less(void *dst, const void *src, size_t n) return ret; } -#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 - -#define ALIGNMENT_MASK 0x3F - -/** - * AVX512 implementation below - */ - /** * Copy 16 bytes from one location to another, * locations should not overlap. 
@@ -108,8 +110,8 @@ rte_mov16(uint8_t *dst, const uint8_t *src) { __m128i xmm0; - xmm0 = _mm_loadu_si128((const __m128i *)src); - _mm_storeu_si128((__m128i *)dst, xmm0); + xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); + _mm_storeu_si128((__m128i *)(void *)dst, xmm0); } /** @@ -119,10 +121,15 @@ rte_mov16(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { +#if defined RTE_MEMCPY_AVX __m256i ymm0; - ymm0 = _mm256_loadu_si256((const __m256i *)src); - _mm256_storeu_si256((__m256i *)dst, ymm0); + ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); + _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); +#else /* SSE implementation */ + rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); + rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); +#endif } /** @@ -132,10 +139,15 @@ rte_mov32(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov64(uint8_t *dst, const uint8_t *src) { +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 __m512i zmm0; zmm0 = _mm512_loadu_si512((const void *)src); _mm512_storeu_si512((void *)dst, zmm0); +#else /* AVX2, AVX & SSE implementation */ + rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); + rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); +#endif } /** @@ -156,12 +168,18 @@ rte_mov128(uint8_t *dst, const uint8_t *src) static __rte_always_inline void rte_mov256(uint8_t *dst, const uint8_t *src) { - rte_mov64(dst + 0 * 64, src + 0 * 64); - rte_mov64(dst + 1 * 64, src + 1 * 64); - rte_mov64(dst + 2 * 64, src + 2 * 64); - rte_mov64(dst + 3 * 64, src + 3 * 64); + rte_mov128(dst + 0 * 128, src + 0 * 128); + rte_mov128(dst + 1 * 128, src + 1 * 128); } +#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512 + +/** + * AVX512 implementation below + */ + +#define ALIGNMENT_MASK 0x3F + /** * Copy 128-byte blocks from one location to another, * locations should not overlap. @@ -231,12 +249,22 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 512 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, @@ -313,80 +341,13 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) goto COPY_BLOCK_128_BACK63; } -#elif defined __AVX2__ - -#define ALIGNMENT_MASK 0x1F +#elif defined RTE_MEMCPY_AVX /** - * AVX2 implementation below + * AVX implementation below */ -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - -/** - * Copy 32 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - __m256i ymm0; - - ymm0 = _mm256_loadu_si256((const __m256i *)(const void *)src); - _mm256_storeu_si256((__m256i *)(void *)dst, ymm0); -} - -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); - rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); - rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); - rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); - rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32); - rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32); - rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32); - rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32); -} +#define ALIGNMENT_MASK 0x1F /** * Copy 128-byte blocks from one location to another, @@ -437,15 +398,14 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) /** * Fast way when copy size doesn't exceed 256 bytes */ - if (n <= 32) { - rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, - (const uint8_t *)src - 16 + n); + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); return ret; } - if (n <= 48) { + if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 16, (const uint8_t *)src + 16); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; @@ -513,90 +473,11 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) #else /* __AVX512F__ */ -#define ALIGNMENT_MASK 0x0F - -/** - * SSE & AVX implementation below - */ - -/** - * Copy 16 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov16(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm0; - - xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src); - _mm_storeu_si128((__m128i *)(void *)dst, xmm0); -} - /** - * Copy 32 bytes from one location to another, - * locations should not overlap. + * SSE implementation below */ -static __rte_always_inline void -rte_mov32(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); -} -/** - * Copy 64 bytes from one location to another, - * locations should not overlap. 
- */ -static __rte_always_inline void -rte_mov64(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); -} - -/** - * Copy 128 bytes from one location to another, - * locations should not overlap. - */ -static __rte_always_inline void -rte_mov128(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); -} - -/** - * Copy 256 bytes from one location to another, - * locations should not overlap. - */ -static inline void -rte_mov256(uint8_t *dst, const uint8_t *src) -{ - rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16); - rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16); - rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16); - rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16); - rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16); - rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16); - rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16); - rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16); - rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16); - rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16); - rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16); - rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16); - rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16); - rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16); - rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16); - rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16); -} +#define ALIGNMENT_MASK 0x0F /** * Macro for copying unaligned block from one location to another with constant load offset, @@ -712,17 +593,15 @@ rte_memcpy_generic(void *dst, const void *src, size_t n) */ if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); - return ret; - } - if (n <= 48) { - rte_mov32((uint8_t *)dst, (const uint8_t *)src); + if (__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); - rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); + if (n > 48) + rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32); rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); return ret; } @@ -828,8 +707,14 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 16 <= size <= 32 bytes */ + if (__rte_constant(n) && n == 32) { + rte_mov32((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 32) { rte_mov16((uint8_t *)dst, (const uint8_t *)src); + if 
(__rte_constant(n) && n == 16) + return ret; /* avoid (harmless) duplicate copy */ rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n); @@ -837,6 +722,10 @@ rte_memcpy_aligned(void *dst, const void *src, size_t n) } /* Copy 32 < size <= 64 bytes */ + if (__rte_constant(n) && n == 64) { + rte_mov64((uint8_t *)dst, (const uint8_t *)src); + return ret; + } if (n <= 64) { rte_mov32((uint8_t *)dst, (const uint8_t *)src); rte_mov32((uint8_t *)dst - 32 + n, -- 2.17.1
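To illustrate what the build-time size check in this patch buys a caller, here is a minimal sketch of a call site that benefits from the n == 16 specialization. The struct and function names are illustrative only, and the snippet assumes an x86 DPDK build whose rte_memcpy.h already contains the change above.

#include <stdint.h>
#include <rte_memcpy.h>

/* Illustrative 16-byte object; any copy whose size is a build-time
 * constant 16 behaves the same way. */
struct meta16 {
	uint64_t id;
	uint64_t flags;
};

static inline void
copy_meta(struct meta16 *dst, const struct meta16 *src)
{
	/* sizeof(*dst) is a compile-time constant 16, so the new
	 * "__rte_constant(n) && n == 16" check takes effect and the copy
	 * collapses to a single rte_mov16(), instead of also issuing the
	 * second, overlapping copy of bytes [n - 16, n) as before. */
	rte_memcpy(dst, src, sizeof(*dst));
}

With a size of 16 that is only known at run time, nothing changes: the second 16-byte copy is still performed, exactly as in the old code, and remains harmless.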
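Since v2 through v4 of this series revolve around which vector path the header ends up selecting (AVX512, the new RTE_MEMCPY_AVX, or SSE), a small build-time probe can confirm what a given toolchain actually picked. The program below is not part of the patch, just a sketch that mirrors the header's own #if ladder on x86:

#include <stdio.h>
#include <rte_memcpy.h>

int
main(void)
{
	/* Same conditions as the header: AVX512 needs both the compiler flag
	 * and the RTE_MEMCPY_AVX512 build option; RTE_MEMCPY_AVX is defined
	 * for AVX2, or for AVX with GCC >= 11 or a non-GCC compiler;
	 * everything else falls back to the SSE implementation. */
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
	puts("rte_memcpy: AVX512 implementation");
#elif defined RTE_MEMCPY_AVX
	puts("rte_memcpy: AVX implementation");
#else
	puts("rte_memcpy: SSE implementation");
#endif
	return 0;
}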
* Re: [PATCH v9] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 13:27 ` [PATCH v9] " Morten Brørup @ 2024-07-09 15:42 ` David Marchand 2024-07-10 8:03 ` David Marchand 1 sibling, 0 replies; 40+ messages in thread From: David Marchand @ 2024-07-09 15:42 UTC (permalink / raw) To: Morten Brørup Cc: dev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom On Tue, Jul 9, 2024 at 3:28 PM Morten Brørup <mb@smartsharesystems.com> wrote: > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build time, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> > --- > v9: > * Remove dependency on other patch; it has been applied already. > * Add extra casts to fix warnings in OVS build. (David Marchand) I'll wait for the CI to finish. On my side, OVS compilation now passes (though we still have a regression in net/virtio-user + control q :-( ). v8: https://github.com/david-marchand/ovs/actions/runs/9858595974/job/27221724340#step:12:517 v9: https://github.com/david-marchand/ovs/actions/runs/9858595974/job/27220442547 -- David Marchand
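For readers following the v8 to v9 delta mentioned here, the "extra casts" are the intermediate (const void *) / (void *) casts visible in the rte_mov16() and rte_mov32() hunks of the patch. A minimal before/after sketch of the pattern follows, with the caveat that the exact warning flag tripped by the OVS build is not named in this thread and is therefore an assumption:

#include <stdint.h>
#include <emmintrin.h>

/* v8-style direct cast from uint8_t * to __m128i *; stricter warning setups
 * (such as the OVS build referenced above) can complain about the pointer
 * cast. Which flag exactly is an assumption, not stated in the thread. */
static inline void
mov16_v8_style(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0 = _mm_loadu_si128((const __m128i *)src);
	_mm_storeu_si128((__m128i *)dst, xmm0);
}

/* v9-style cast through (const void *) / (void *), as in the patch above;
 * it silences the pointer-cast warning and generates identical code. */
static inline void
mov16_v9_style(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm0 = _mm_loadu_si128((const __m128i *)(const void *)src);
	_mm_storeu_si128((__m128i *)(void *)dst, xmm0);
}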
* Re: [PATCH v9] eal/x86: improve rte_memcpy const size 16 performance 2024-07-09 13:27 ` [PATCH v9] " Morten Brørup 2024-07-09 15:42 ` David Marchand @ 2024-07-10 8:03 ` David Marchand 1 sibling, 0 replies; 40+ messages in thread From: David Marchand @ 2024-07-10 8:03 UTC (permalink / raw) To: Morten Brørup Cc: dev, bruce.richardson, konstantin.v.ananyev, stephen, roretzla, mattias.ronnblom On Tue, Jul 9, 2024 at 3:28 PM Morten Brørup <mb@smartsharesystems.com> wrote: > > When the rte_memcpy() size is 16, the same 16 bytes are copied twice. > In the case where the size is known to be 16 at build time, omit the > duplicate copy. > > Reduced the amount of effectively copy-pasted code by using #ifdef > inside functions instead of outside functions. > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com> > Acked-by: Bruce Richardson <bruce.richardson@intel.com> > Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com> CI looks good (with OVS compilation tested). Applied, thanks. -- David Marchand