From: Min Zhou <zhoumin@loongson.cn>
To: thomas@monjalon.net, david.marchand@redhat.com,
bruce.richardson@intel.com, anatoly.burakov@intel.com,
qiming.yang@intel.com, Yuying.Zhang@intel.com,
jgrajcia@cisco.com, konstantin.v.ananyev@yandex.ru
Cc: dev@dpdk.org, maobibo@loongson.cn
Subject: [v3 07/24] eal/loongarch: add dummy vector memcpy for LoongArch
Date: Mon, 6 Jun 2022 21:10:37 +0800 [thread overview]
Message-ID: <20220606131054.2097526-8-zhoumin@loongson.cn> (raw)
In-Reply-To: <20220606131054.2097526-1-zhoumin@loongson.cn>

A vector implementation of memcpy based on hardware SIMD instructions will
come later. For now, this dummy implementation works.

Signed-off-by: Min Zhou <zhoumin@loongson.cn>
---
lib/eal/loongarch/include/rte_memcpy.h | 193 +++++++++++++++++++++++++
lib/eal/loongarch/include/rte_vect.h | 46 ++++++
2 files changed, 239 insertions(+)
create mode 100644 lib/eal/loongarch/include/rte_memcpy.h
create mode 100644 lib/eal/loongarch/include/rte_vect.h
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
new file mode 100644
index 0000000000..98dc3dfc3b
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_MEMCPY_LOONGARCH_H_
+#define _RTE_MEMCPY_LOONGARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+ *(xmm_t *)dst = *(const xmm_t *)src;
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+ rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+ rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+ rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+ rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+ rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+ rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+ rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+ rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+ rte_mov128(dst, src);
+ rte_mov128(dst + 128, src + 128);
+}
+
+#define rte_memcpy(dst, src, n) \
+ rte_memcpy_func((dst), (src), (n))
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+ void *ret = dst;
+
+ /* We can't copy < 16 bytes using XMM registers, so do it manually. */
+ if (n < 16) {
+ if (n & 0x01) {
+ *(uint8_t *)dst = *(const uint8_t *)src;
+ dst = (uint8_t *)dst + 1;
+ src = (const uint8_t *)src + 1;
+ }
+ if (n & 0x02) {
+ *(uint16_t *)dst = *(const uint16_t *)src;
+ dst = (uint16_t *)dst + 1;
+ src = (const uint16_t *)src + 1;
+ }
+ if (n & 0x04) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ dst = (uint32_t *)dst + 1;
+ src = (const uint32_t *)src + 1;
+ }
+ if (n & 0x08)
+ *(uint64_t *)dst = *(const uint64_t *)src;
+ return ret;
+ }
+
+ /* Special fast cases for <= 128 bytes */
+ if (n <= 32) {
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
+ return ret;
+ }
+
+ if (n <= 64) {
+ rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov32((uint8_t *)dst - 32 + n,
+ (const uint8_t *)src - 32 + n);
+ return ret;
+ }
+
+ if (n <= 128) {
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ rte_mov64((uint8_t *)dst - 64 + n,
+ (const uint8_t *)src - 64 + n);
+ return ret;
+ }
+
+ /*
+ * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
+ * copies was found to be faster than also doing 128 and 32 byte copies.
+ */
+ for ( ; n >= 256; n -= 256) {
+ rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+ dst = (uint8_t *)dst + 256;
+ src = (const uint8_t *)src + 256;
+ }
+
+ /*
+ * We split the remaining bytes (which will be less than 256) into
+ * 64-byte (2^6) chunks.
+ * Using incrementing integers in the case labels of a switch statement
+ * encourages the compiler to use a jump table. To get incrementing
+ * integers, we shift the 2 relevant bits to the LSB position to first
+ * get decrementing integers, and then subtract.
+ */
+ switch (3 - (n >> 6)) {
+ case 0x00:
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ n -= 64;
+ dst = (uint8_t *)dst + 64;
+ src = (const uint8_t *)src + 64; /* fallthrough */
+ case 0x01:
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ n -= 64;
+ dst = (uint8_t *)dst + 64;
+ src = (const uint8_t *)src + 64; /* fallthrough */
+ case 0x02:
+ rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+ n -= 64;
+ dst = (uint8_t *)dst + 64;
+ src = (const uint8_t *)src + 64; /* fallthrough */
+ default:
+ break;
+ }
+
+ /*
+ * We split the remaining bytes (which will be less than 64) into
+ * 16-byte (2^4) chunks, using the same switch structure as above.
+ */
+ switch (3 - (n >> 4)) {
+ case 0x00:
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ n -= 16;
+ dst = (uint8_t *)dst + 16;
+ src = (const uint8_t *)src + 16; /* fallthrough */
+ case 0x01:
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ n -= 16;
+ dst = (uint8_t *)dst + 16;
+ src = (const uint8_t *)src + 16; /* fallthrough */
+ case 0x02:
+ rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+ n -= 16;
+ dst = (uint8_t *)dst + 16;
+ src = (const uint8_t *)src + 16; /* fallthrough */
+ default:
+ break;
+ }
+
+ /* Copy any remaining bytes, without going beyond end of buffers */
+ if (n != 0)
+ rte_mov16((uint8_t *)dst - 16 + n,
+ (const uint8_t *)src - 16 + n);
+ return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_LOONGARCH_H_ */
diff --git a/lib/eal/loongarch/include/rte_vect.h b/lib/eal/loongarch/include/rte_vect.h
new file mode 100644
index 0000000000..3e96fdd958
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_vect.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_VECT_LOONGARCH_H_
+#define _RTE_VECT_LOONGARCH_H_
+
+#include <stdint.h>
+#include "rte_common.h"
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
+typedef union xmm {
+ int8_t i8[16];
+ int16_t i16[8];
+ int32_t i32[4];
+ int64_t i64[2];
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ double pd[2];
+} __rte_aligned(16) xmm_t;
+
+#define XMM_SIZE (sizeof(xmm_t))
+#define XMM_MASK (XMM_SIZE - 1)
+
+typedef union rte_xmm {
+ xmm_t x;
+ uint8_t u8[XMM_SIZE / sizeof(uint8_t)];
+ uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+ uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+ uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+ double pd[XMM_SIZE / sizeof(double)];
+} __rte_aligned(16) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--
2.31.1