DPDK patches and discussions
 help / color / mirror / Atom feed
From: Marvin Liu <yong.liu@intel.com>
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, zhihong.wang@intel.com
Cc: dev@dpdk.org, Marvin Liu <yong.liu@intel.com>
Subject: [dpdk-dev] [PATCH v3 4/5] vhost: add packed ring vectorized dequeue
Date: Fri,  9 Oct 2020 16:14:09 +0800
Message-ID: <20201009081410.63944-5-yong.liu@intel.com> (raw)
In-Reply-To: <20201009081410.63944-1-yong.liu@intel.com>

Optimize vhost packed ring dequeue path with SIMD instructions. Four
descriptors status check and writeback are batched handled with AVX512
instructions. Address translation operations are also accelerated by
AVX512 instructions.

If platform or compiler not support vectorization, will fallback to
default path.

Signed-off-by: Marvin Liu <yong.liu@intel.com>

diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
index cc9aa65c6..5eadcbae4 100644
--- a/lib/librte_vhost/meson.build
+++ b/lib/librte_vhost/meson.build
@@ -8,6 +8,22 @@ endif
 if has_libnuma == 1
 	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
 endif
+
+if arch_subdir == 'x86'
+        if not machine_args.contains('-mno-avx512f')
+                if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
+                        cflags += ['-DCC_AVX512_SUPPORT']
+                        vhost_avx512_lib = static_library('vhost_avx512_lib',
+                                              'virtio_net_avx.c',
+                                              dependencies: [static_rte_eal, static_rte_mempool,
+                                                  static_rte_mbuf, static_rte_ethdev, static_rte_net],
+                                              include_directories: includes,
+                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
+                        objs += vhost_avx512_lib.extract_objects('virtio_net_avx.c')
+                endif
+        endif
+endif
+
 if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
 	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
 elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index a19fe9423..b270c424b 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -1100,4 +1100,15 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
 
 	return NULL;
 }
+
+int
+vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids);
+
+
 #endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 9757ed053..3bc6b9b20 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -2136,6 +2136,28 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 	return -1;
 }
 
+static __rte_always_inline int
+vhost_handle_avail_batch_packed(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids)
+{
+#ifdef CC_AVX512_SUPPORT
+	if (unlikely(dev->vectorized))
+		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
+				pkts, avail_idx, desc_addrs, ids);
+	else
+		return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool,
+				pkts, avail_idx, desc_addrs, ids);
+#else
+	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
+			avail_idx, desc_addrs, ids);
+#endif
+}
+
 static __rte_always_inline int
 virtio_dev_tx_batch_packed(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -2148,8 +2170,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
 	uint16_t ids[PACKED_BATCH_SIZE];
 	uint16_t i;
 
-	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
-					     avail_idx, desc_addrs, ids))
+
+	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
+		avail_idx, desc_addrs, ids))
 		return -1;
 
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
diff --git a/lib/librte_vhost/virtio_net_avx.c b/lib/librte_vhost/virtio_net_avx.c
new file mode 100644
index 000000000..e10b2a285
--- /dev/null
+++ b/lib/librte_vhost/virtio_net_avx.c
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+#include <stdint.h>
+
+#include "vhost.h"
+
+#define BYTE_SIZE 8
+/* reference count offset in mbuf rearm data */
+#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+/* segment number offset in mbuf rearm data */
+#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+/* default rearm data */
+#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
+	1ULL << REFCNT_BITS_OFFSET)
+
+#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc, flags) / \
+	sizeof(uint16_t))
+
+#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
+	sizeof(uint16_t))
+#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
+
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) \
+	<< FLAGS_BITS_OFFSET)
+#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
+#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
+	FLAGS_BITS_OFFSET)
+
+#define DESC_FLAGS_POS 0xaa
+#define MBUF_LENS_POS 0x6666
+
+int
+vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids)
+{
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint32_t descs_status;
+	void *desc_addr;
+	uint16_t i;
+	uint8_t cmp_low, cmp_high, cmp_result;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	struct virtio_net_hdr *hdr;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	/* load 4 descs */
+	desc_addr = &vq->desc_packed[avail_idx];
+	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
+
+	/* burst check four status */
+	__m512i avail_flag_vec;
+	if (vq->avail_wrap_counter)
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG, 0x0,
+					PACKED_FLAGS_MASK, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_AVAIL_FLAG);
+
+#endif
+	else
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
+					0x0, PACKED_AVAIL_FLAG_WRAP, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_AVAIL_FLAG_WRAP);
+#endif
+
+	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
+		_MM_CMPINT_NE);
+	if (descs_status & BATCH_FLAGS_MASK)
+		return -1;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t size = (uint64_t)descs[avail_idx + i].len;
+			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
+				descs[avail_idx + i].addr, &size,
+				VHOST_ACCESS_RO);
+
+			if (!desc_addrs[i])
+				goto free_buf;
+			lens[i] = descs[avail_idx + i].len;
+			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
+					lens[i]);
+			if (!pkts[i])
+				goto free_buf;
+		}
+	} else {
+		/* check buffer fit into one region & translate address */
+		struct mem_regions_range *range = dev->regions_range;
+		__m512i regions_low_addrs =
+			_mm512_loadu_si512((void *)&range->regions_low_addrs);
+		__m512i regions_high_addrs =
+			_mm512_loadu_si512((void *)&range->regions_high_addrs);
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t addr_low = descs[avail_idx + i].addr;
+			uint64_t addr_high = addr_low +
+						descs[avail_idx + i].len;
+			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
+			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
+
+			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
+					regions_low_addrs, _MM_CMPINT_NLT);
+			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
+					regions_high_addrs, _MM_CMPINT_LT);
+			cmp_result = cmp_low & cmp_high;
+			int index = __builtin_ctz(cmp_result);
+			if (unlikely((uint32_t)index >= dev->mem->nregions))
+				goto free_buf;
+
+			desc_addrs[i] = addr_low +
+				dev->mem->regions[index].host_user_addr -
+				dev->mem->regions[index].guest_phys_addr;
+			lens[i] = descs[avail_idx + i].len;
+			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
+					lens[i]);
+			if (!pkts[i])
+				goto free_buf;
+		}
+	}
+
+	if (virtio_net_with_host_offload(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
+			vhost_dequeue_offload(hdr, pkts[i]);
+		}
+	}
+
+	if (virtio_net_is_inorder(dev)) {
+		ids[PACKED_BATCH_SIZE - 1] =
+			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;
+	} else {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			ids[i] = descs[avail_idx + i].id;
+	}
+
+	uint64_t addrs[PACKED_BATCH_SIZE << 1];
+	/* store mbuf data_len, pkt_len */
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
+		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
+					+ sizeof(uint64_t);
+	}
+
+	/* save pkt_len and data_len into mbufs */
+	__m512i value_vec = _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
+					0xAA);
+	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
+					(uint32_t)-12);
+	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
+	__m512i vindex = _mm512_loadu_si512((void *)addrs);
+	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
+
+	return 0;
+free_buf:
+	for (i = 0; i < PACKED_BATCH_SIZE; i++)
+		rte_pktmbuf_free(pkts[i]);
+
+	return -1;
+}
-- 
2.17.1


  parent reply	other threads:[~2020-10-09  8:21 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-19  3:24 [dpdk-dev] [PATCH v1 0/5] vhost add vectorized data path Marvin Liu
2020-08-19  3:24 ` [dpdk-dev] [PATCH v1 1/5] vhost: " Marvin Liu
2020-09-21  6:48   ` [dpdk-dev] [PATCH v2 0/5] vhost " Marvin Liu
2020-09-21  6:48     ` [dpdk-dev] [PATCH v2 1/5] vhost: " Marvin Liu
2020-09-21  6:48     ` [dpdk-dev] [PATCH v2 2/5] vhost: reuse packed ring functions Marvin Liu
2020-09-21  6:48     ` [dpdk-dev] [PATCH v2 3/5] vhost: prepare memory regions addresses Marvin Liu
2020-10-06 15:06       ` Maxime Coquelin
2020-09-21  6:48     ` [dpdk-dev] [PATCH v2 4/5] vhost: add packed ring vectorized dequeue Marvin Liu
2020-10-06 14:59       ` Maxime Coquelin
2020-10-08  7:05         ` Liu, Yong
2020-10-06 15:18       ` Maxime Coquelin
2020-10-09  7:59         ` Liu, Yong
2020-09-21  6:48     ` [dpdk-dev] [PATCH v2 5/5] vhost: add packed ring vectorized enqueue Marvin Liu
2020-10-06 15:00       ` Maxime Coquelin
2020-10-08  7:09         ` Liu, Yong
2020-10-06 13:34     ` [dpdk-dev] [PATCH v2 0/5] vhost add vectorized data path Maxime Coquelin
2020-10-08  6:20       ` Liu, Yong
2020-10-09  8:14   ` [dpdk-dev] [PATCH v3 " Marvin Liu
2020-10-09  8:14     ` [dpdk-dev] [PATCH v3 1/5] vhost: " Marvin Liu
2020-10-09  8:14     ` [dpdk-dev] [PATCH v3 2/5] vhost: reuse packed ring functions Marvin Liu
2020-10-09  8:14     ` [dpdk-dev] [PATCH v3 3/5] vhost: prepare memory regions addresses Marvin Liu
2020-10-09  8:14     ` Marvin Liu [this message]
2020-10-09  8:14     ` [dpdk-dev] [PATCH v3 5/5] vhost: add packed ring vectorized enqueue Marvin Liu
2020-10-12  8:21     ` [dpdk-dev] [PATCH v3 0/5] vhost add vectorized data path Maxime Coquelin
2020-10-12  9:10       ` Liu, Yong
2020-10-12  9:57         ` Maxime Coquelin
2020-10-12 13:24           ` Liu, Yong
2020-10-15 15:28       ` Liu, Yong
2020-10-15 15:35         ` Maxime Coquelin
2020-08-19  3:24 ` [dpdk-dev] [PATCH v1 2/5] vhost: reuse packed ring functions Marvin Liu
2020-08-19  3:24 ` [dpdk-dev] [PATCH v1 3/5] vhost: prepare memory regions addresses Marvin Liu
2020-08-19  3:24 ` [dpdk-dev] [PATCH v1 4/5] vhost: add packed ring vectorized dequeue Marvin Liu
2020-09-18 13:44   ` Maxime Coquelin
2020-09-21  6:26     ` Liu, Yong
2020-09-21  7:47       ` Liu, Yong
2020-08-19  3:24 ` [dpdk-dev] [PATCH v1 5/5] vhost: add packed ring vectorized enqueue Marvin Liu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201009081410.63944-5-yong.liu@intel.com \
    --to=yong.liu@intel.com \
    --cc=chenbo.xia@intel.com \
    --cc=dev@dpdk.org \
    --cc=maxime.coquelin@redhat.com \
    --cc=zhihong.wang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git