From: Min Zhou <zhoumin@loongson.cn>
To: thomas@monjalon.net, david.marchand@redhat.com,
	bruce.richardson@intel.com, anatoly.burakov@intel.com,
	qiming.yang@intel.com, Yuying.Zhang@intel.com,
	jgrajcia@cisco.com, konstantin.v.ananyev@yandex.ru
Cc: dev@dpdk.org, maobibo@loongson.cn
Subject: [v3 07/24] eal/loongarch: add dummy vector memcpy for LoongArch
Date: Mon, 6 Jun 2022 21:10:37 +0800
Message-Id: <20220606131054.2097526-8-zhoumin@loongson.cn>
X-Mailer: git-send-email 2.31.1
In-Reply-To: <20220606131054.2097526-1-zhoumin@loongson.cn>
References: <20220606131054.2097526-1-zhoumin@loongson.cn>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

A vector memcpy implementation based on LoongArch hardware
instructions will come later. Until then, this dummy implementation
works as a functional placeholder.
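
The fast paths for 16 < n <= 128 bytes copy a power-of-two sized head
forward from the start of the buffer and an equally sized tail backward
from the end; the two stores may overlap, which covers every length in
the range with exactly two moves and no length arithmetic. A minimal
standalone sketch of that idea (plain C, with memcpy standing in for
the 16-byte vector move; the helper name is illustrative, not taken
from this patch):

#include <stdint.h>
#include <string.h>

/* Copy 16 <= n <= 32 bytes with two (possibly overlapping) 16-byte
 * moves: one anchored at the start, one anchored at the end.
 */
static inline void
copy_16_to_32(uint8_t *dst, const uint8_t *src, size_t n)
{
	memcpy(dst, src, 16);                   /* bytes [0, 16)     */
	memcpy(dst + n - 16, src + n - 16, 16); /* bytes [n - 16, n) */
}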
Signed-off-by: Min Zhou <zhoumin@loongson.cn>
---
 lib/eal/loongarch/include/rte_memcpy.h | 193 +++++++++++++++++++++++++
 lib/eal/loongarch/include/rte_vect.h   |  46 ++++++
 2 files changed, 239 insertions(+)
 create mode 100644 lib/eal/loongarch/include/rte_memcpy.h
 create mode 100644 lib/eal/loongarch/include/rte_vect.h

diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
new file mode 100644
index 0000000000..98dc3dfc3b
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_MEMCPY_LOONGARCH_H_
+#define _RTE_MEMCPY_LOONGARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	*(xmm_t *)dst = *(const xmm_t *)src;
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+#define rte_memcpy(dst, src, n)              \
+	rte_memcpy_func((dst), (src), (n))
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+
+	/*
+	 * We can't copy < 16 bytes using XMM registers so do it manually.
+	 */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			dst = (uint8_t *)dst + 1;
+			src = (const uint8_t *)src + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			dst = (uint16_t *)dst + 1;
+			src = (const uint16_t *)src + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			dst = (uint32_t *)dst + 1;
+			src = (const uint32_t *)src + 1;
+		}
+		if (n & 0x08)
+			*(uint64_t *)dst = *(const uint64_t *)src;
+		return ret;
+	}
+
+	/* Special fast cases for <= 128 bytes */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+			(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+
+	if (n <= 128) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov64((uint8_t *)dst - 64 + n,
+			(const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/*
+	 * For large copies > 128 bytes. This combination of 256-, 64- and
+	 * 16-byte copies was found to be faster than doing 128- and 32-byte
+	 * copies as well.
+	 */
+	for ( ; n >= 256; n -= 256) {
+		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+		dst = (uint8_t *)dst + 256;
+		src = (const uint8_t *)src + 256;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 256) into
+	 * 64-byte (2^6) chunks.
+	 * Using incrementing integers in the case labels of a switch statement
+	 * encourages the compiler to use a jump table. To get incrementing
+	 * integers, we shift the 2 relevant bits to the LSB position to first
+	 * get decrementing integers, and then subtract.
+	 */
+	switch (3 - (n >> 6)) {
+	case 0x00:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;	/* fallthrough */
+	case 0x01:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;	/* fallthrough */
+	case 0x02:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;	/* fallthrough */
+	default:
+		break;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 64) into
+	 * 16-byte (2^4) chunks, using the same switch structure as above.
+	 */
+	switch (3 - (n >> 4)) {
+	case 0x00:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;	/* fallthrough */
+	case 0x01:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;	/* fallthrough */
+	case 0x02:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;	/* fallthrough */
+	default:
+		break;
+	}
+
+	/* Copy any remaining bytes, without going beyond end of buffers */
+	if (n != 0)
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+	return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_LOONGARCH_H_ */
diff --git a/lib/eal/loongarch/include/rte_vect.h b/lib/eal/loongarch/include/rte_vect.h
new file mode 100644
index 0000000000..3e96fdd958
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_vect.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_VECT_LOONGARCH_H_
+#define _RTE_VECT_LOONGARCH_H_
+
+#include <stdint.h>
+#include "rte_common.h"
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
+typedef union xmm {
+	int8_t   i8[16];
+	int16_t  i16[8];
+	int32_t  i32[4];
+	int64_t  i64[2];
+	uint8_t  u8[16];
+	uint16_t u16[8];
+	uint32_t u32[4];
+	uint64_t u64[2];
+	double   pd[2];
+} __rte_aligned(16) xmm_t;
+
+#define XMM_SIZE	(sizeof(xmm_t))
+#define XMM_MASK	(XMM_SIZE - 1)
+
+typedef union rte_xmm {
+	xmm_t    x;
+	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+	double   pd[XMM_SIZE / sizeof(double)];
+} __rte_aligned(16) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--
2.31.1
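
For anyone wanting to exercise every size class of this rte_memcpy
quickly, a minimal test program sketch follows. It is hypothetical and
not part of the patch; it assumes a DPDK build for LoongArch so that
this header is the one resolved via <rte_memcpy.h>:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <rte_memcpy.h>

int main(void)
{
	uint8_t src[512], dst[512];
	/* one size per branch: < 16, <= 32, <= 64, <= 128,
	 * the two switch paths, and the overlapping tail copy
	 */
	const size_t sizes[] = { 7, 16, 31, 64, 129, 300, 511 };
	size_t i;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (uint8_t)i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		memset(dst, 0, sizeof(dst));
		rte_memcpy(dst, src, sizes[i]);
		printf("n=%3zu %s\n", sizes[i],
			memcmp(dst, src, sizes[i]) == 0 ? "ok" : "FAIL");
	}
	return 0;
}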