From mboxrd@z Thu Jan 1 00:00:00 1970
From: Ravi Kerur
To: "dev@dpdk.org"
Date: Fri, 8 May 2015 16:10:52 -0700
Subject: Re: [dpdk-dev] [PATCH v2] Clean up rte_memcpy.h file
In-Reply-To: <1429562009-11817-1-git-send-email-rkerur@gmail.com>
References: <1429561948-11777-1-git-send-email-rkerur@gmail.com>
 <1429562009-11817-1-git-send-email-rkerur@gmail.com>

Any inputs here? There is no functionality change, just cleanup. I have run
"make test" and "memcpy_perf_autotest" and have not noticed any change in the
numbers.

On Mon, Apr 20, 2015 at 1:33 PM, Ravi Kerur wrote:

> Remove unnecessary type casting in functions.
>
> Tested on Ubuntu (14.04 x86_64) with "make test".
> "make test" results match the baseline.
> "Memcpy perf" results match the baseline.
>
> Signed-off-by: Ravi Kerur
> ---
>  .../common/include/arch/x86/rte_memcpy.h | 340 +++++++++++----------
>  1 file changed, 175 insertions(+), 165 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index 6a57426..839d4ec 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> @@ -106,8 +106,8 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> +        rte_mov32(dst + 0 * 32, src + 0 * 32);
> +        rte_mov32(dst + 1 * 32, src + 1 * 32);
>  }
>
>  /**
> @@ -117,10 +117,10 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov128(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -        rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -        rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> +        rte_mov32(dst + 0 * 32, src + 0 * 32);
> +        rte_mov32(dst + 1 * 32, src + 1 * 32);
> +        rte_mov32(dst + 2 * 32, src + 2 * 32);
> +        rte_mov32(dst + 3 * 32, src + 3 * 32);
>  }
>
>  /**
> @@ -130,14 +130,14 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
> -        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
> -        rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
> -        rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
> -        rte_mov32((uint8_t *)dst + 4 * 32, (const uint8_t *)src + 4 * 32);
> -        rte_mov32((uint8_t *)dst + 5 * 32, (const uint8_t *)src + 5 * 32);
> -        rte_mov32((uint8_t *)dst + 6 * 32, (const uint8_t *)src + 6 * 32);
> -        rte_mov32((uint8_t *)dst + 7 * 32, (const uint8_t *)src + 7 * 32);
> +        rte_mov32(dst + 0 * 32, src + 0 * 32);
> +        rte_mov32(dst + 1 * 32, src + 1 * 32);
> +        rte_mov32(dst + 2 * 32, src + 2 * 32);
> +        rte_mov32(dst + 3 * 32, src + 3 * 32);
> +        rte_mov32(dst + 4 * 32, src + 4 * 32);
> +        rte_mov32(dst + 5 * 32, src + 5 * 32);
> +        rte_mov32(dst + 6 * 32, src + 6 * 32);
> +        rte_mov32(dst + 7 * 32, src + 7 * 32);
>  }
>
>  /**
> @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, size_t n)
>          __m256i ymm0, ymm1;
>
>          while (n >= 64) {
> -                ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
> +
> +                ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
> +                ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
> +
> +                _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0);
> +                _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1);
> +
>                  n -= 64;
> -                ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
> -                src = (const uint8_t *)src + 64;
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
> -                dst = (uint8_t *)dst + 64;
> +                src = src + 64;
> +                dst = dst + 64;
>          }
>  }
>
> @@ -170,34 +173,39 @@ rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
>          __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
>
>          while (n >= 256) {
> -                ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 0 * 32));
> +
> +                ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
> +                ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
> +                ymm2 = _mm256_loadu_si256((const __m256i *)(src + 2 * 32));
> +                ymm3 = _mm256_loadu_si256((const __m256i *)(src + 3 * 32));
> +                ymm4 = _mm256_loadu_si256((const __m256i *)(src + 4 * 32));
> +                ymm5 = _mm256_loadu_si256((const __m256i *)(src + 5 * 32));
> +                ymm6 = _mm256_loadu_si256((const __m256i *)(src + 6 * 32));
> +                ymm7 = _mm256_loadu_si256((const __m256i *)(src + 7 * 32));
> +
> +                _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0);
> +                _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1);
> +                _mm256_storeu_si256((__m256i *)(dst + 2 * 32), ymm2);
> +                _mm256_storeu_si256((__m256i *)(dst + 3 * 32), ymm3);
> +                _mm256_storeu_si256((__m256i *)(dst + 4 * 32), ymm4);
> +                _mm256_storeu_si256((__m256i *)(dst + 5 * 32), ymm5);
> +                _mm256_storeu_si256((__m256i *)(dst + 6 * 32), ymm6);
> +                _mm256_storeu_si256((__m256i *)(dst + 7 * 32), ymm7);
> +
>                  n -= 256;
> -                ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 1 * 32));
> -                ymm2 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 2 * 32));
> -                ymm3 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 3 * 32));
> -                ymm4 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 4 * 32));
> -                ymm5 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 5 * 32));
> -                ymm6 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 6 * 32));
> -                ymm7 = _mm256_loadu_si256((const __m256i *)((const uint8_t *)src + 7 * 32));
> -                src = (const uint8_t *)src + 256;
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 2 * 32), ymm2);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 3 * 32), ymm3);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 4 * 32), ymm4);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 5 * 32), ymm5);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 6 * 32), ymm6);
> -                _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 7 * 32), ymm7);
> -                dst = (uint8_t *)dst + 256;
> +                src = src + 256;
> +                dst = dst + 256;
>          }
>  }
>
>  static inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *_dst, const void *_src, size_t n)
>  {
> -        uintptr_t dstu = (uintptr_t)dst;
> -        uintptr_t srcu = (uintptr_t)src;
> -        void *ret = dst;
> +        const uint8_t *src = (const uint8_t *)_src;
> +        uint8_t *dst = (uint8_t *)_dst;
> +        uintptr_t dstu = (uintptr_t)_dst;
> +        uintptr_t srcu = (uintptr_t)_src;
> +        void *ret = _dst;
>          size_t dstofss;
>          size_t bits;
>
> @@ -230,43 +238,43 @@ rte_memcpy(void *dst, const void *src, size_t n)
>           * Fast way when copy size doesn't exceed 512 bytes
>           */
>          if (n <= 32) {
> -                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +                rte_mov16(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
>                  return ret;
>          }
>          if (n <= 64) {
> -                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -                rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +                rte_mov32(dst, src);
> +                rte_mov32(dst - 32 + n, src - 32 + n);
>                  return ret;
>          }
>          if (n <= 512) {
>                  if (n >= 256) {
>                          n -= 256;
> -                        rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 256;
> -                        dst = (uint8_t *)dst + 256;
> +                        rte_mov256(dst, src);
> +                        src = src + 256;
> +                        dst = dst + 256;
>                  }
>                  if (n >= 128) {
>                          n -= 128;
> -                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 128;
> -                        dst = (uint8_t *)dst + 128;
> +                        rte_mov128(dst, src);
> +                        src = src + 128;
> +                        dst = dst + 128;
>                  }
>                  if (n >= 64) {
>                          n -= 64;
> -                        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 64;
> -                        dst = (uint8_t *)dst + 64;
> +                        rte_mov64(dst, src);
> +                        src = src + 64;
> +                        dst = dst + 64;
>                  }
>  COPY_BLOCK_64_BACK31:
>                  if (n > 32) {
> -                        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -                        rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> +                        rte_mov32(dst, src);
> +                        rte_mov32(dst - 32 + n, src - 32 + n);
>                          return ret;
>                  }
> -                if (n > 0) {
> -                        rte_mov32((uint8_t *)dst - 32 + n, (const uint8_t *)src - 32 + n);
> -                }
> +                if (n > 0)
> +                        rte_mov32(dst - 32 + n, src - 32 + n);
> +
>                  return ret;
>          }
>
> @@ -275,21 +283,21 @@ COPY_BLOCK_64_BACK31:
>           */
>          dstofss = 32 - ((uintptr_t)dst & 0x1F);
>          n -= dstofss;
> -        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -        src = (const uint8_t *)src + dstofss;
> -        dst = (uint8_t *)dst + dstofss;
> +        rte_mov32(dst, src);
> +        src = src + dstofss;
> +        dst = dst + dstofss;
>
>          /**
>           * Copy 256-byte blocks.
>           * Use copy block function for better instruction order control,
>           * which is important when load is unaligned.
>           */
> -        rte_mov256blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +        rte_mov256blocks(dst, src, n);
>          bits = n;
>          n = n & 255;
>          bits -= n;
> -        src = (const uint8_t *)src + bits;
> -        dst = (uint8_t *)dst + bits;
> +        src = src + bits;
> +        dst = dst + bits;
>
>          /**
>           * Copy 64-byte blocks.
> @@ -297,12 +305,12 @@ COPY_BLOCK_64_BACK31:
>           * which is important when load is unaligned.
>           */
>          if (n >= 64) {
> -                rte_mov64blocks((uint8_t *)dst, (const uint8_t *)src, n);
> +                rte_mov64blocks(dst, src, n);
>                  bits = n;
>                  n = n & 63;
>                  bits -= n;
> -                src = (const uint8_t *)src + bits;
> -                dst = (uint8_t *)dst + bits;
> +                src = src + bits;
> +                dst = dst + bits;
>          }
>
>          /**
> @@ -337,8 +345,8 @@ rte_mov16(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> +        rte_mov16(dst + 0 * 16, src + 0 * 16);
> +        rte_mov16(dst + 1 * 16, src + 1 * 16);
>  }
>
>  /**
> @@ -348,10 +356,10 @@ rte_mov32(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov64(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> +        rte_mov16(dst + 0 * 16, src + 0 * 16);
> +        rte_mov16(dst + 1 * 16, src + 1 * 16);
> +        rte_mov16(dst + 2 * 16, src + 2 * 16);
> +        rte_mov16(dst + 3 * 16, src + 3 * 16);
>  }
>
>  /**
> @@ -361,14 +369,14 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov128(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -        rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -        rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -        rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -        rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> +        rte_mov16(dst + 0 * 16, src + 0 * 16);
> +        rte_mov16(dst + 1 * 16, src + 1 * 16);
> +        rte_mov16(dst + 2 * 16, src + 2 * 16);
> +        rte_mov16(dst + 3 * 16, src + 3 * 16);
> +        rte_mov16(dst + 4 * 16, src + 4 * 16);
> +        rte_mov16(dst + 5 * 16, src + 5 * 16);
> +        rte_mov16(dst + 6 * 16, src + 6 * 16);
> +        rte_mov16(dst + 7 * 16, src + 7 * 16);
>  }
>
>  /**
> @@ -378,22 +386,22 @@ rte_mov128(uint8_t *dst, const uint8_t *src)
>  static inline void
>  rte_mov256(uint8_t *dst, const uint8_t *src)
>  {
> -        rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
> -        rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
> -        rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
> -        rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
> -        rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
> -        rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
> -        rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
> -        rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
> -        rte_mov16((uint8_t *)dst + 8 * 16, (const uint8_t *)src + 8 * 16);
> -        rte_mov16((uint8_t *)dst + 9 * 16, (const uint8_t *)src + 9 * 16);
> -        rte_mov16((uint8_t *)dst + 10 * 16, (const uint8_t *)src + 10 * 16);
> -        rte_mov16((uint8_t *)dst + 11 * 16, (const uint8_t *)src + 11 * 16);
> -        rte_mov16((uint8_t *)dst + 12 * 16, (const uint8_t *)src + 12 * 16);
> -        rte_mov16((uint8_t *)dst + 13 * 16, (const uint8_t *)src + 13 * 16);
> -        rte_mov16((uint8_t *)dst + 14 * 16, (const uint8_t *)src + 14 * 16);
> -        rte_mov16((uint8_t *)dst + 15 * 16, (const uint8_t *)src + 15 * 16);
> +        rte_mov16(dst + 0 * 16, src + 0 * 16);
> +        rte_mov16(dst + 1 * 16, src + 1 * 16);
> +        rte_mov16(dst + 2 * 16, src + 2 * 16);
> +        rte_mov16(dst + 3 * 16, src + 3 * 16);
> +        rte_mov16(dst + 4 * 16, src + 4 * 16);
> +        rte_mov16(dst + 5 * 16, src + 5 * 16);
> +        rte_mov16(dst + 6 * 16, src + 6 * 16);
> +        rte_mov16(dst + 7 * 16, src + 7 * 16);
> +        rte_mov16(dst + 8 * 16, src + 8 * 16);
> +        rte_mov16(dst + 9 * 16, src + 9 * 16);
> +        rte_mov16(dst + 10 * 16, src + 10 * 16);
> +        rte_mov16(dst + 11 * 16, src + 11 * 16);
> +        rte_mov16(dst + 12 * 16, src + 12 * 16);
> +        rte_mov16(dst + 13 * 16, src + 13 * 16);
> +        rte_mov16(dst + 14 * 16, src + 14 * 16);
> +        rte_mov16(dst + 15 * 16, src + 15 * 16);
>  }
>
>  /**
> @@ -411,48 +419,48 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
>  ({                                                                    \
>          int tmp;                                                      \
>          while (len >= 128 + 16 - offset) {                            \
> -                xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
> -                len -= 128;                                           \
> -                xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
> -                xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
> -                xmm3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 3 * 16)); \
> -                xmm4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 4 * 16)); \
> -                xmm5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 5 * 16)); \
> -                xmm6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 6 * 16)); \
> -                xmm7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 7 * 16)); \
> -                xmm8 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 8 * 16)); \
> +                xmm0 = _mm_loadu_si128((const __m128i *)(src - offset + 0 * 16)); \
> +                len -= 128;                                           \
> +                xmm1 = _mm_loadu_si128((const __m128i *)(src - offset + 1 * 16)); \
> +                xmm2 = _mm_loadu_si128((const __m128i *)(src - offset + 2 * 16)); \
> +                xmm3 = _mm_loadu_si128((const __m128i *)(src - offset + 3 * 16)); \
> +                xmm4 = _mm_loadu_si128((const __m128i *)(src - offset + 4 * 16)); \
> +                xmm5 = _mm_loadu_si128((const __m128i *)(src - offset + 5 * 16)); \
> +                xmm6 = _mm_loadu_si128((const __m128i *)(src - offset + 6 * 16)); \
> +                xmm7 = _mm_loadu_si128((const __m128i *)(src - offset + 7 * 16)); \
> +                xmm8 = _mm_loadu_si128((const __m128i *)(src - offset + 8 * 16)); \
>                  src = (const uint8_t *)src + 128;                     \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
> -                _mm_storeu_si128((__m128i *)((uint8_t *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
> +                _mm_storeu_si128((__m128i *)(dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
>                  dst = (uint8_t *)dst + 128;                           \
>          }                                                             \
>          tmp = len;                                                    \
>          len = ((len - 16 + offset) & 127) + 16 - offset;              \
>          tmp -= len;                                                   \
> -        src = (const uint8_t *)src + tmp;                             \
> -        dst = (uint8_t *)dst + tmp;                                   \
> +        src = src + tmp;                                              \
> +        dst = dst + tmp;                                              \
>          if (len >= 32 + 16 - offset) {                                \
>                  while (len >= 32 + 16 - offset) {                     \
> -                        xmm0 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 0 * 16)); \
>                          len -= 32;                                    \
> -                        xmm1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 1 * 16)); \
> -                        xmm2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)src - offset + 2 * 16)); \
> -                        src = (const uint8_t *)src + 32;              \
> -                        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
> -                        _mm_storeu_si128((__m128i *)((uint8_t *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
> -                        dst = (uint8_t *)dst + 32;                    \
> +                        xmm0 = _mm_loadu_si128((const __m128i *)(src - offset + 0 * 16)); \
> +                        xmm1 = _mm_loadu_si128((const __m128i *)(src - offset + 1 * 16)); \
> +                        xmm2 = _mm_loadu_si128((const __m128i *)(src - offset + 2 * 16)); \
> +                        src = src + 32;                               \
> +                        _mm_storeu_si128((__m128i *)(dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
> +                        _mm_storeu_si128((__m128i *)(dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
> +                        dst = dst + 32;                               \
>                  }                                                     \
>          tmp = len;                                                    \
>          len = ((len - 16 + offset) & 31) + 16 - offset;               \
>          tmp -= len;                                                   \
> -        src = (const uint8_t *)src + tmp;                             \
> -        dst = (uint8_t *)dst + tmp;                                   \
> +        src = src + tmp;                                              \
> +        dst = dst + tmp;                                              \
>          }                                                             \
>  })
>
> @@ -491,12 +499,14 @@ rte_mov256(uint8_t *dst, const uint8_t *src)
>  })
>
>  static inline void *
> -rte_memcpy(void *dst, const void *src, size_t n)
> +rte_memcpy(void *_dst, const void *_src, size_t n)
>  {
>          __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
> -        uintptr_t dstu = (uintptr_t)dst;
> -        uintptr_t srcu = (uintptr_t)src;
> -        void *ret = dst;
> +        const uint8_t *src = (const uint8_t *)_src;
> +        uint8_t *dst = (uint8_t *)_dst;
> +        uintptr_t dstu = (uintptr_t)_dst;
> +        uintptr_t srcu = (uintptr_t)_src;
> +        void *ret = _dst;
>          size_t dstofss;
>          size_t srcofs;
>
> @@ -529,61 +539,61 @@ rte_memcpy(void *dst, const void *src, size_t n)
>           * Fast way when copy size doesn't exceed 512 bytes
>           */
>          if (n <= 32) {
> -                rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +                rte_mov16(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
>                  return ret;
>          }
>          if (n <= 48) {
> -                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +                rte_mov32(dst, src);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
>                  return ret;
>          }
>          if (n <= 64) {
> -                rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -                rte_mov16((uint8_t *)dst + 32, (const uint8_t *)src + 32);
> -                rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +                rte_mov32(dst, src);
> +                rte_mov16(dst + 32, src + 32);
> +                rte_mov16(dst - 16 + n, src - 16 + n);
>                  return ret;
>          }
> -        if (n <= 128) {
> +        if (n <= 128)
>                  goto COPY_BLOCK_128_BACK15;
> -        }
> +
>          if (n <= 512) {
>                  if (n >= 256) {
>                          n -= 256;
> -                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -                        rte_mov128((uint8_t *)dst + 128, (const uint8_t *)src + 128);
> -                        src = (const uint8_t *)src + 256;
> -                        dst = (uint8_t *)dst + 256;
> +                        rte_mov128(dst, src);
> +                        rte_mov128(dst + 128, src + 128);
> +                        src = src + 256;
> +                        dst = dst + 256;
>                  }
>  COPY_BLOCK_255_BACK15:
>                  if (n >= 128) {
>                          n -= 128;
> -                        rte_mov128((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 128;
> -                        dst = (uint8_t *)dst + 128;
> +                        rte_mov128(dst, src);
> +                        src = src + 128;
> +                        dst = dst + 128;
>                  }
>  COPY_BLOCK_128_BACK15:
>                  if (n >= 64) {
>                          n -= 64;
> -                        rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 64;
> -                        dst = (uint8_t *)dst + 64;
> +                        rte_mov64(dst, src);
> +                        src = src + 64;
> +                        dst = dst + 64;
>                  }
>  COPY_BLOCK_64_BACK15:
>                  if (n >= 32) {
>                          n -= 32;
> -                        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -                        src = (const uint8_t *)src + 32;
> -                        dst = (uint8_t *)dst + 32;
> +                        rte_mov32(dst, src);
> +                        src = src + 32;
> +                        dst = dst + 32;
>                  }
>                  if (n > 16) {
> -                        rte_mov16((uint8_t *)dst, (const uint8_t *)src);
> -                        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> +                        rte_mov16(dst, src);
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
>                          return ret;
>                  }
> -                if (n > 0) {
> -                        rte_mov16((uint8_t *)dst - 16 + n, (const uint8_t *)src - 16 + n);
> -                }
> +                if (n > 0)
> +                        rte_mov16(dst - 16 + n, src - 16 + n);
> +
>                  return ret;
>          }
>
> @@ -595,9 +605,9 @@ COPY_BLOCK_64_BACK15:
>           */
>          dstofss = 16 - ((uintptr_t)dst & 0x0F) + 16;
>          n -= dstofss;
> -        rte_mov32((uint8_t *)dst, (const uint8_t *)src);
> -        src = (const uint8_t *)src + dstofss;
> -        dst = (uint8_t *)dst + dstofss;
> +        rte_mov32(dst, src);
> +        src = src + dstofss;
> +        dst = dst + dstofss;
>          srcofs = ((uintptr_t)src & 0x0F);
>
>          /**
> @@ -608,9 +618,9 @@ COPY_BLOCK_64_BACK15:
>           * Copy 256-byte blocks
>           */
>          for (; n >= 256; n -= 256) {
> -                rte_mov256((uint8_t *)dst, (const uint8_t *)src);
> -                dst = (uint8_t *)dst + 256;
> -                src = (const uint8_t *)src + 256;
> +                rte_mov256(dst, src);
> +                dst = dst + 256;
> +                src = src + 256;
>          }
>
>          /**
> --
> 1.9.1
>
>
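For anyone who wants to double-check the "no functional change" claim outside of
"make test" and "memcpy_perf_autotest", below is a minimal standalone sketch (not
part of the patch) that compares rte_memcpy() against libc memcpy() over a range
of copy sizes and buffer offsets. It assumes a DPDK build environment where
rte_memcpy.h is on the include path and the compiler flags match the target CPU
(e.g. AVX2 enabled for the 32-byte paths); the buffer sizes and loop bounds are
illustrative only.

/* Hypothetical standalone check: rte_memcpy() vs. libc memcpy(). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <rte_memcpy.h>

int main(void)
{
        static uint8_t src[4096], dst_a[4096], dst_b[4096];
        size_t i, n, off;

        for (i = 0; i < sizeof(src); i++)
                src[i] = (uint8_t)i;

        /* Exercise the short-copy branches and the block-copy loops. */
        for (n = 0; n <= 2048; n++) {
                for (off = 0; off < 16; off++) {
                        memset(dst_a, 0, sizeof(dst_a));
                        memset(dst_b, 0, sizeof(dst_b));
                        rte_memcpy(dst_a + off, src + off, n);
                        memcpy(dst_b + off, src + off, n);
                        assert(memcmp(dst_a, dst_b, sizeof(dst_a)) == 0);
                }
        }
        printf("rte_memcpy matches memcpy for all tested sizes/offsets\n");
        return 0;
}

This only checks correctness; the performance claim still rests on the
memcpy_perf_autotest numbers mentioned above.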