DPDK patches and discussions
 help / color / mirror / Atom feed
From: Min Zhou <zhoumin@loongson.cn>
To: thomas@monjalon.net, david.marchand@redhat.com,
	bruce.richardson@intel.com, anatoly.burakov@intel.com,
	qiming.yang@intel.com, Yuying.Zhang@intel.com,
	jgrajcia@cisco.com, konstantin.v.ananyev@yandex.ru
Cc: dev@dpdk.org, maobibo@loongson.cn
Subject: [v3 07/24] eal/loongarch: add dummy vector memcpy for LoongArch
Date: Mon,  6 Jun 2022 21:10:37 +0800	[thread overview]
Message-ID: <20220606131054.2097526-8-zhoumin@loongson.cn> (raw)
In-Reply-To: <20220606131054.2097526-1-zhoumin@loongson.cn>

The hardware instructions based vector implementation for memcpy
will come later. At present, this dummy implementation can also
work.

Signed-off-by: Min Zhou <zhoumin@loongson.cn>
---
 lib/eal/loongarch/include/rte_memcpy.h | 193 +++++++++++++++++++++++++
 lib/eal/loongarch/include/rte_vect.h   |  46 ++++++
 2 files changed, 239 insertions(+)
 create mode 100644 lib/eal/loongarch/include/rte_memcpy.h
 create mode 100644 lib/eal/loongarch/include/rte_vect.h

diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
new file mode 100644
index 0000000000..98dc3dfc3b
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_MEMCPY_LOONGARCH_H_
+#define _RTE_MEMCPY_LOONGARCH_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <rte_vect.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "generic/rte_memcpy.h"
+
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	*(xmm_t *)dst = *(const xmm_t *)src;
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov16((uint8_t *)dst + 0 * 16, (const uint8_t *)src + 0 * 16);
+	rte_mov16((uint8_t *)dst + 1 * 16, (const uint8_t *)src + 1 * 16);
+	rte_mov16((uint8_t *)dst + 2 * 16, (const uint8_t *)src + 2 * 16);
+	rte_mov16((uint8_t *)dst + 3 * 16, (const uint8_t *)src + 3 * 16);
+	rte_mov16((uint8_t *)dst + 4 * 16, (const uint8_t *)src + 4 * 16);
+	rte_mov16((uint8_t *)dst + 5 * 16, (const uint8_t *)src + 5 * 16);
+	rte_mov16((uint8_t *)dst + 6 * 16, (const uint8_t *)src + 6 * 16);
+	rte_mov16((uint8_t *)dst + 7 * 16, (const uint8_t *)src + 7 * 16);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+#define rte_memcpy(dst, src, n)      \
+	rte_memcpy_func((dst), (src), (n))
+
+static inline void *
+rte_memcpy_func(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+
+	/* We can't copy < 16 bytes using XMM registers so do it manually. */
+	if (n < 16) {
+		if (n & 0x01) {
+			*(uint8_t *)dst = *(const uint8_t *)src;
+			dst = (uint8_t *)dst + 1;
+			src = (const uint8_t *)src + 1;
+		}
+		if (n & 0x02) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			dst = (uint16_t *)dst + 1;
+			src = (const uint16_t *)src + 1;
+		}
+		if (n & 0x04) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			dst = (uint32_t *)dst + 1;
+			src = (const uint32_t *)src + 1;
+		}
+		if (n & 0x08)
+			*(uint64_t *)dst = *(const uint64_t *)src;
+		return ret;
+	}
+
+	/* Special fast cases for <= 128 bytes */
+	if (n <= 32) {
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+		return ret;
+	}
+
+	if (n <= 64) {
+		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov32((uint8_t *)dst - 32 + n,
+			(const uint8_t *)src - 32 + n);
+		return ret;
+	}
+
+	if (n <= 128) {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		rte_mov64((uint8_t *)dst - 64 + n,
+			(const uint8_t *)src - 64 + n);
+		return ret;
+	}
+
+	/*
+	 * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
+	 * copies was found to be faster than doing 128 and 32 byte copies as
+	 * well.
+	 */
+	for ( ; n >= 256; n -= 256) {
+		rte_mov256((uint8_t *)dst, (const uint8_t *)src);
+		dst = (uint8_t *)dst + 256;
+		src = (const uint8_t *)src + 256;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 256) into
+	 * 64byte (2^6) chunks.
+	 * Using incrementing integers in the case labels of a switch statement
+	 * encourages the compiler to use a jump table. To get incrementing
+	 * integers, we shift the 2 relevant bits to the LSB position to first
+	 * get decrementing integers, and then subtract.
+	 */
+	switch (3 - (n >> 6)) {
+	case 0x00:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	case 0x01:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	case 0x02:
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		n -= 64;
+		dst = (uint8_t *)dst + 64;
+		src = (const uint8_t *)src + 64;      /* fallthrough */
+	default:
+		break;
+	}
+
+	/*
+	 * We split the remaining bytes (which will be less than 64) into
+	 * 16byte (2^4) chunks, using the same switch structure as above.
+	 */
+	switch (3 - (n >> 4)) {
+	case 0x00:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	case 0x01:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	case 0x02:
+		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
+		n -= 16;
+		dst = (uint8_t *)dst + 16;
+		src = (const uint8_t *)src + 16;      /* fallthrough */
+	default:
+		break;
+	}
+
+	/* Copy any remaining bytes, without going beyond end of buffers */
+	if (n != 0)
+		rte_mov16((uint8_t *)dst - 16 + n,
+			(const uint8_t *)src - 16 + n);
+	return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_MEMCPY_LOONGARCH_H_ */
diff --git a/lib/eal/loongarch/include/rte_vect.h b/lib/eal/loongarch/include/rte_vect.h
new file mode 100644
index 0000000000..3e96fdd958
--- /dev/null
+++ b/lib/eal/loongarch/include/rte_vect.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Loongson Technology Corporation Limited
+ */
+
+#ifndef _RTE_VECT_LOONGARCH_H_
+#define _RTE_VECT_LOONGARCH_H_
+
+#include <stdint.h>
+#include "rte_common.h"
+#include "generic/rte_vect.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_256
+
+typedef union xmm {
+	int8_t   i8[16];
+	int16_t  i16[8];
+	int32_t  i32[4];
+	int64_t  i64[2];
+	uint8_t  u8[16];
+	uint16_t u16[8];
+	uint32_t u32[4];
+	uint64_t u64[2];
+	double   pd[2];
+} __rte_aligned(16) xmm_t;
+
+#define XMM_SIZE        (sizeof(xmm_t))
+#define XMM_MASK        (XMM_SIZE - 1)
+
+typedef union rte_xmm {
+	xmm_t x;
+	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+	double   pd[XMM_SIZE / sizeof(double)];
+} __rte_aligned(16) rte_xmm_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
-- 
2.31.1


  parent reply	other threads:[~2022-06-06 13:11 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-06 13:10 [v3 00/24] Support LoongArch architecture Min Zhou
2022-06-06 13:10 ` [v3 01/24] eal/loongarch: add atomic operations for LoongArch Min Zhou
2022-07-20 10:16   ` zhoumin
2022-06-06 13:10 ` [v3 02/24] eal/loongarch: add byte order " Min Zhou
2022-06-06 13:10 ` [v3 03/24] eal/loongarch: add cpu cycle " Min Zhou
2022-06-06 13:10 ` [v3 04/24] eal/loongarch: add prefetch " Min Zhou
2022-06-06 13:10 ` [v3 05/24] eal/loongarch: add spinlock " Min Zhou
2022-06-06 13:10 ` [v3 06/24] eal/loongarch: add cpu flag checks " Min Zhou
2022-06-06 13:10 ` Min Zhou [this message]
2022-06-06 13:10 ` [v3 08/24] eal/loongarch: add io operations " Min Zhou
2022-06-06 13:10 ` [v3 09/24] eal/loongarch: add mcslock " Min Zhou
2022-06-06 13:10 ` [v3 10/24] eal/loongarch: add pause " Min Zhou
2022-06-06 13:10 ` [v3 11/24] eal/loongarch: add pflock " Min Zhou
2022-06-06 13:10 ` [v3 12/24] eal/loongarch: add rwlock " Min Zhou
2022-06-06 13:10 ` [v3 13/24] eal/loongarch: add ticketlock " Min Zhou
2022-06-06 13:10 ` [v3 14/24] eal/loongarch: add power " Min Zhou
2022-06-06 13:10 ` [v3 15/24] eal/loongarch: add hypervisor " Min Zhou
2022-06-06 13:10 ` [v3 16/24] mem: add huge page size definition " Min Zhou
2022-06-06 13:10 ` [v3 17/24] eal/linux: set eal base address " Min Zhou
2022-06-06 13:10 ` [v3 18/24] meson: introduce LoongArch architecture Min Zhou
2022-06-06 13:10 ` [v3 19/24] test/xmmt_ops: add dummy vector implementation for LoongArch Min Zhou
2022-06-06 13:10 ` [v3 20/24] ixgbe: " Min Zhou
2022-06-06 13:10 ` [v3 21/24] i40e: " Min Zhou
2022-06-06 13:10 ` [v3 22/24] tap: add system call number " Min Zhou
2022-06-06 13:10 ` [v3 23/24] memif: " Min Zhou
2022-06-06 13:10 ` [v3 24/24] maintainers: claim responsibility " Min Zhou
2022-07-20 16:33 ` [v3 00/24] Support LoongArch architecture David Marchand
2022-07-21 10:33   ` zhoumin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220606131054.2097526-8-zhoumin@loongson.cn \
    --to=zhoumin@loongson.cn \
    --cc=Yuying.Zhang@intel.com \
    --cc=anatoly.burakov@intel.com \
    --cc=bruce.richardson@intel.com \
    --cc=david.marchand@redhat.com \
    --cc=dev@dpdk.org \
    --cc=jgrajcia@cisco.com \
    --cc=konstantin.v.ananyev@yandex.ru \
    --cc=maobibo@loongson.cn \
    --cc=qiming.yang@intel.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).