DPDK patches and discussions
* [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON
@ 2020-09-11 12:09 Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Joyce Kong @ 2020-09-11 12:09 UTC (permalink / raw)
  To: maxime.coquelin
  Cc: jerinj, dev, nd, honnappa.nagarahalli, ruifeng.wang, phil.yang

This patch set introduces a vectorized RX path for the packed ring
with NEON.

Joyce Kong (3):
  net/virtio: move AVX based Rx and Tx code to separate file
  net/virtio: add vectorized packed ring Rx NEON path
  net/virtio: add election for packed vector Rx NEON path

 doc/guides/nics/virtio.rst                   |   4 +-
 drivers/net/virtio/meson.build               |   2 +
 drivers/net/virtio/virtio_ethdev.c           |  19 +-
 drivers/net/virtio/virtio_rxtx.c             |   7 +-
 drivers/net/virtio/virtio_rxtx_packed.c      |  37 +++
 drivers/net/virtio/virtio_rxtx_packed.h      | 300 +++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_packed_avx.c  | 264 +---------------
 drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   2 +
 9 files changed, 566 insertions(+), 271 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c

-- 
2.28.0



* [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file
  2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
@ 2020-09-11 12:09 ` Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2020-09-11 12:09 UTC (permalink / raw)
  To: maxime.coquelin
  Cc: jerinj, dev, nd, honnappa.nagarahalli, ruifeng.wang, phil.yang

Split out the AVX instruction based virtio packed ring Rx and Tx
implementation into a separate file.
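
The split relies on the usual weak-symbol pattern: the common
virtio_rxtx_packed.c provides __rte_weak stubs, and the architecture
specific file carries the strong definitions that the linker prefers
whenever that file is built. A minimal sketch of the pattern (illustrative
only; the exact code is in the diff below):

    /* virtio_rxtx_packed.c (common): weak fallback, only reachable if no
     * arch-specific implementation was linked in.
     */
    __rte_weak uint16_t
    virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                                uint16_t nb_pkts)
    {
        rte_panic("Wrong weak function linked by linker\n");
        return 0;
    }

    /* virtio_rxtx_packed_avx.c (arch-specific): strong definition that
     * overrides the weak stub at link time.
     */
    uint16_t
    virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                                uint16_t nb_pkts)
    {
        /* vectorized receive loop goes here */
        return 0;
    }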

Signed-off-by: Phil Yang <phil.yang@arm.com>
---
 drivers/net/virtio/meson.build              |   1 +
 drivers/net/virtio/virtio_rxtx_packed.c     |  37 +++
 drivers/net/virtio/virtio_rxtx_packed.h     | 284 ++++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_packed_avx.c | 264 +-----------------
 4 files changed, 323 insertions(+), 263 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h

diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index 3fd6051f4..e1851b0a6 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -5,6 +5,7 @@ sources += files('virtio_ethdev.c',
 	'virtio_pci.c',
 	'virtio_rxtx.c',
 	'virtio_rxtx_simple.c',
+	'virtio_rxtx_packed.c',
 	'virtqueue.c')
 deps += ['kvargs', 'bus_pci']
 
diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c
new file mode 100644
index 000000000..e614e19fc
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.c
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+/* Stub for linkage when arch specific implementation is not available */
+__rte_weak uint16_t
+virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
+			struct rte_mbuf **tx_pkts __rte_unused,
+			uint16_t nb_pkts __rte_unused)
+{
+	rte_panic("Wrong weak function linked by linker\n");
+	return 0;
+}
+
+/* Stub for linkage when arch specific implementation is not available */
+__rte_weak uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
+			    struct rte_mbuf **rx_pkts __rte_unused,
+			    uint16_t nb_pkts __rte_unused)
+{
+	rte_panic("Wrong weak function linked by linker\n");
+	return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
new file mode 100644
index 000000000..b2447843b
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _VIRTIO_RXTX_PACKED_H_
+#define _VIRTIO_RXTX_PACKED_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+#define BYTE_SIZE 8
+/* flag bits offset in packed ring desc higher 64bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
+	FLAGS_BITS_OFFSET)
+
+/* reference count offset in mbuf rearm data */
+#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+/* segment number offset in mbuf rearm data */
+#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+/* default rearm data */
+#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
+	1ULL << REFCNT_BITS_OFFSET)
+
+/* id bits offset in packed ring desc higher 64bits */
+#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+/* net hdr short size mask */
+#define NET_HDR_MASK 0x3F
+
+#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
+	sizeof(struct vring_packed_desc))
+#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
+
+#ifdef VIRTIO_GCC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_ICC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifndef virtio_for_each_try_unroll
+#define virtio_for_each_try_unroll(iter, val, num) \
+	for (iter = val; iter < num; iter++)
+#endif
+
+static inline void
+virtio_update_batch_stats(struct virtnet_stats *stats,
+			  uint16_t pkt_len1,
+			  uint16_t pkt_len2,
+			  uint16_t pkt_len3,
+			  uint16_t pkt_len4)
+{
+	stats->bytes += pkt_len1;
+	stats->bytes += pkt_len2;
+	stats->bytes += pkt_len3;
+	stats->bytes += pkt_len4;
+}
+
+static inline int
+virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
+				    struct rte_mbuf *txm)
+{
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint16_t slots, can_push;
+	int16_t need;
+
+	/* How many main ring entries are needed to this Tx?
+	 * any_layout => number of segments
+	 * default    => number of segments + 1
+	 */
+	can_push = rte_mbuf_refcnt_read(txm) == 1 &&
+		   RTE_MBUF_DIRECT(txm) &&
+		   txm->nb_segs == 1 &&
+		   rte_pktmbuf_headroom(txm) >= hdr_size;
+
+	slots = txm->nb_segs + !can_push;
+	need = slots - vq->vq_free_cnt;
+
+	/* Positive value indicates it need free vring descriptors */
+	if (unlikely(need > 0)) {
+		virtio_xmit_cleanup_inorder_packed(vq, need);
+		need = slots - vq->vq_free_cnt;
+		if (unlikely(need > 0)) {
+			PMD_TX_LOG(ERR,
+				   "No free tx descriptors to transmit");
+			return -1;
+		}
+	}
+
+	/* Enqueue Packet buffers */
+	virtqueue_enqueue_xmit_packed(txvq, txm, slots, can_push, 1);
+
+	txvq->stats.bytes += txm->pkt_len;
+	return 0;
+}
+
+/* Optionally fill offload information in structure */
+static inline int
+virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct rte_net_hdr_lens hdr_lens;
+	uint32_t hdrlen, ptype;
+	int l4_supported = 0;
+
+	/* nothing to do */
+	if (hdr->flags == 0)
+		return 0;
+
+	/* GSO not support in vec path, skip check */
+	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
+
+	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
+	m->packet_type = ptype;
+	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
+		l4_supported = 1;
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
+		if (hdr->csum_start <= hdrlen && l4_supported) {
+			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
+		} else {
+			/* Unknown proto or tunnel, do sw cksum. We can assume
+			 * the cksum field is in the first segment since the
+			 * buffers we provided to the host are large enough.
+			 * In case of SCTP, this will be wrong since it's a CRC
+			 * but there's nothing we can do.
+			 */
+			uint16_t csum = 0, off;
+
+			rte_raw_cksum_mbuf(m, hdr->csum_start,
+				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
+				&csum);
+			if (likely(csum != 0xffff))
+				csum = ~csum;
+			off = hdr->csum_offset + hdr->csum_start;
+			if (rte_pktmbuf_data_len(m) >= off + 1)
+				*rte_pktmbuf_mtod_offset(m, uint16_t *,
+					off) = csum;
+		}
+	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
+		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
+				    struct rte_mbuf **rx_pkts)
+{
+	uint16_t used_idx, id;
+	uint32_t len;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint32_t hdr_size = hw->vtnet_hdr_size;
+	struct virtio_net_hdr *hdr;
+	struct vring_packed_desc *desc;
+	struct rte_mbuf *cookie;
+
+	desc = vq->vq_packed.ring.desc;
+	used_idx = vq->vq_used_cons_idx;
+	if (!desc_is_used(&desc[used_idx], vq))
+		return -1;
+
+	len = desc[used_idx].len;
+	id = desc[used_idx].id;
+	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
+	if (unlikely(cookie == NULL)) {
+		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
+				vq->vq_used_cons_idx);
+		return -1;
+	}
+	rte_prefetch0(cookie);
+	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
+
+	cookie->data_off = RTE_PKTMBUF_HEADROOM;
+	cookie->ol_flags = 0;
+	cookie->pkt_len = (uint32_t)(len - hdr_size);
+	cookie->data_len = (uint32_t)(len - hdr_size);
+
+	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size);
+	if (hw->has_rx_offload)
+		virtio_vec_rx_offload(cookie, hdr);
+
+	*rx_pkts = cookie;
+
+	rxvq->stats.bytes += cookie->pkt_len;
+
+	vq->vq_free_cnt++;
+	vq->vq_used_cons_idx++;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static inline void
+virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
+			      struct rte_mbuf **cookie,
+			      uint16_t num)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
+	uint16_t flags = vq->vq_packed.cached_flags;
+	struct virtio_hw *hw = vq->hw;
+	struct vq_desc_extra *dxp;
+	uint16_t idx, i;
+	uint16_t batch_num, total_num = 0;
+	uint16_t head_idx = vq->vq_avail_idx;
+	uint16_t head_flag = vq->vq_packed.cached_flags;
+	uint64_t addr;
+
+	do {
+		idx = vq->vq_avail_idx;
+
+		batch_num = PACKED_BATCH_SIZE;
+		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+			batch_num = vq->vq_nentries - idx;
+		if (unlikely((total_num + batch_num) > num))
+			batch_num = num - total_num;
+
+		virtio_for_each_try_unroll(i, 0, batch_num) {
+			dxp = &vq->vq_descx[idx + i];
+			dxp->cookie = (void *)cookie[total_num + i];
+
+			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
+				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
+			start_dp[idx + i].addr = addr;
+			start_dp[idx + i].len = cookie[total_num + i]->buf_len
+				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
+			if (total_num || i) {
+				virtqueue_store_flags_packed(&start_dp[idx + i],
+						flags, hw->weak_barriers);
+			}
+		}
+
+		vq->vq_avail_idx += batch_num;
+		if (vq->vq_avail_idx >= vq->vq_nentries) {
+			vq->vq_avail_idx -= vq->vq_nentries;
+			vq->vq_packed.cached_flags ^=
+				VRING_PACKED_DESC_F_AVAIL_USED;
+			flags = vq->vq_packed.cached_flags;
+		}
+		total_num += batch_num;
+	} while (total_num < num);
+
+	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
+				hw->weak_barriers);
+	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
+}
+
+#endif /* _VIRTIO_RXTX_PACKED_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
index 6a8214725..c8fbb8f2c 100644
--- a/drivers/net/virtio/virtio_rxtx_packed_avx.c
+++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
@@ -13,71 +13,9 @@
 #include "virtio_logs.h"
 #include "virtio_ethdev.h"
 #include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
 #include "virtqueue.h"
 
-#define BYTE_SIZE 8
-/* flag bits offset in packed ring desc higher 64bits */
-#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
-	FLAGS_BITS_OFFSET)
-
-/* reference count offset in mbuf rearm data */
-#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-/* segment number offset in mbuf rearm data */
-#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-
-/* default rearm data */
-#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
-	1ULL << REFCNT_BITS_OFFSET)
-
-/* id bits offset in packed ring desc higher 64bits */
-#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-/* net hdr short size mask */
-#define NET_HDR_MASK 0x3F
-
-#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
-	sizeof(struct vring_packed_desc))
-#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
-
-#ifdef VIRTIO_GCC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_ICC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifndef virtio_for_each_try_unroll
-#define virtio_for_each_try_unroll(iter, val, num) \
-	for (iter = val; iter < num; iter++)
-#endif
-
-static inline void
-virtio_update_batch_stats(struct virtnet_stats *stats,
-			  uint16_t pkt_len1,
-			  uint16_t pkt_len2,
-			  uint16_t pkt_len3,
-			  uint16_t pkt_len4)
-{
-	stats->bytes += pkt_len1;
-	stats->bytes += pkt_len2;
-	stats->bytes += pkt_len3;
-	stats->bytes += pkt_len4;
-}
-
 static inline int
 virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
 				   struct rte_mbuf **tx_pkts)
@@ -200,46 +138,6 @@ virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
 	return 0;
 }
 
-static inline int
-virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
-				    struct rte_mbuf *txm)
-{
-	struct virtqueue *vq = txvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t hdr_size = hw->vtnet_hdr_size;
-	uint16_t slots, can_push;
-	int16_t need;
-
-	/* How many main ring entries are needed to this Tx?
-	 * any_layout => number of segments
-	 * default    => number of segments + 1
-	 */
-	can_push = rte_mbuf_refcnt_read(txm) == 1 &&
-		   RTE_MBUF_DIRECT(txm) &&
-		   txm->nb_segs == 1 &&
-		   rte_pktmbuf_headroom(txm) >= hdr_size;
-
-	slots = txm->nb_segs + !can_push;
-	need = slots - vq->vq_free_cnt;
-
-	/* Positive value indicates it need free vring descriptors */
-	if (unlikely(need > 0)) {
-		virtio_xmit_cleanup_inorder_packed(vq, need);
-		need = slots - vq->vq_free_cnt;
-		if (unlikely(need > 0)) {
-			PMD_TX_LOG(ERR,
-				   "No free tx descriptors to transmit");
-			return -1;
-		}
-	}
-
-	/* Enqueue Packet buffers */
-	virtqueue_enqueue_xmit_packed(txvq, txm, slots, can_push, 1);
-
-	txvq->stats.bytes += txm->pkt_len;
-	return 0;
-}
-
 uint16_t
 virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 			uint16_t nb_pkts)
@@ -293,58 +191,6 @@ virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 	return nb_tx;
 }
 
-/* Optionally fill offload information in structure */
-static inline int
-virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
-{
-	struct rte_net_hdr_lens hdr_lens;
-	uint32_t hdrlen, ptype;
-	int l4_supported = 0;
-
-	/* nothing to do */
-	if (hdr->flags == 0)
-		return 0;
-
-	/* GSO not support in vec path, skip check */
-	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
-
-	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
-	m->packet_type = ptype;
-	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
-		l4_supported = 1;
-
-	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
-		if (hdr->csum_start <= hdrlen && l4_supported) {
-			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
-		} else {
-			/* Unknown proto or tunnel, do sw cksum. We can assume
-			 * the cksum field is in the first segment since the
-			 * buffers we provided to the host are large enough.
-			 * In case of SCTP, this will be wrong since it's a CRC
-			 * but there's nothing we can do.
-			 */
-			uint16_t csum = 0, off;
-
-			rte_raw_cksum_mbuf(m, hdr->csum_start,
-				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
-				&csum);
-			if (likely(csum != 0xffff))
-				csum = ~csum;
-			off = hdr->csum_offset + hdr->csum_start;
-			if (rte_pktmbuf_data_len(m) >= off + 1)
-				*rte_pktmbuf_mtod_offset(m, uint16_t *,
-					off) = csum;
-		}
-	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
-		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-	}
-
-	return 0;
-}
-
 static inline uint16_t
 virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
 				   struct rte_mbuf **rx_pkts)
@@ -445,114 +291,6 @@ virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
 	return 0;
 }
 
-static uint16_t
-virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
-				    struct rte_mbuf **rx_pkts)
-{
-	uint16_t used_idx, id;
-	uint32_t len;
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint32_t hdr_size = hw->vtnet_hdr_size;
-	struct virtio_net_hdr *hdr;
-	struct vring_packed_desc *desc;
-	struct rte_mbuf *cookie;
-
-	desc = vq->vq_packed.ring.desc;
-	used_idx = vq->vq_used_cons_idx;
-	if (!desc_is_used(&desc[used_idx], vq))
-		return -1;
-
-	len = desc[used_idx].len;
-	id = desc[used_idx].id;
-	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
-	if (unlikely(cookie == NULL)) {
-		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
-				vq->vq_used_cons_idx);
-		return -1;
-	}
-	rte_prefetch0(cookie);
-	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
-
-	cookie->data_off = RTE_PKTMBUF_HEADROOM;
-	cookie->ol_flags = 0;
-	cookie->pkt_len = (uint32_t)(len - hdr_size);
-	cookie->data_len = (uint32_t)(len - hdr_size);
-
-	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
-					RTE_PKTMBUF_HEADROOM - hdr_size);
-	if (hw->has_rx_offload)
-		virtio_vec_rx_offload(cookie, hdr);
-
-	*rx_pkts = cookie;
-
-	rxvq->stats.bytes += cookie->pkt_len;
-
-	vq->vq_free_cnt++;
-	vq->vq_used_cons_idx++;
-	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
-		vq->vq_used_cons_idx -= vq->vq_nentries;
-		vq->vq_packed.used_wrap_counter ^= 1;
-	}
-
-	return 0;
-}
-
-static inline void
-virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
-			      struct rte_mbuf **cookie,
-			      uint16_t num)
-{
-	struct virtqueue *vq = rxvq->vq;
-	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
-	uint16_t flags = vq->vq_packed.cached_flags;
-	struct virtio_hw *hw = vq->hw;
-	struct vq_desc_extra *dxp;
-	uint16_t idx, i;
-	uint16_t batch_num, total_num = 0;
-	uint16_t head_idx = vq->vq_avail_idx;
-	uint16_t head_flag = vq->vq_packed.cached_flags;
-	uint64_t addr;
-
-	do {
-		idx = vq->vq_avail_idx;
-
-		batch_num = PACKED_BATCH_SIZE;
-		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
-			batch_num = vq->vq_nentries - idx;
-		if (unlikely((total_num + batch_num) > num))
-			batch_num = num - total_num;
-
-		virtio_for_each_try_unroll(i, 0, batch_num) {
-			dxp = &vq->vq_descx[idx + i];
-			dxp->cookie = (void *)cookie[total_num + i];
-
-			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
-				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
-			start_dp[idx + i].addr = addr;
-			start_dp[idx + i].len = cookie[total_num + i]->buf_len
-				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
-			if (total_num || i) {
-				virtqueue_store_flags_packed(&start_dp[idx + i],
-						flags, hw->weak_barriers);
-			}
-		}
-
-		vq->vq_avail_idx += batch_num;
-		if (vq->vq_avail_idx >= vq->vq_nentries) {
-			vq->vq_avail_idx -= vq->vq_nentries;
-			vq->vq_packed.cached_flags ^=
-				VRING_PACKED_DESC_F_AVAIL_USED;
-			flags = vq->vq_packed.cached_flags;
-		}
-		total_num += batch_num;
-	} while (total_num < num);
-
-	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
-				hw->weak_barriers);
-	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
-}
-
 uint16_t
 virtio_recv_pkts_packed_vec(void *rx_queue,
 			    struct rte_mbuf **rx_pkts,
-- 
2.28.0



* [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path
  2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
@ 2020-09-11 12:09 ` Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 3/3] net/virtio: add election for packed vector " Joyce Kong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2020-09-11 12:09 UTC (permalink / raw)
  To: maxime.coquelin
  Cc: jerinj, dev, nd, honnappa.nagarahalli, ruifeng.wang, phil.yang

Optimize packed ring Rx batch path with NEON instructions.
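
The core of the batch path is checking the avail/used flag bits of four
packed descriptors in one shot. A simplified sketch of that check, assuming
the 16-byte vring_packed_desc layout and the PACKED_FLAGS_MASK definition
from this series (the helper name is illustrative; the exact code is in the
patch below):

    #include <stdint.h>
    #include <arm_neon.h>

    /* Return non-zero when the four descriptors starting at p_desc are all
     * marked used for the current used wrap counter.
     */
    static inline int
    desc_batch_is_used(const struct vring_packed_desc *p_desc,
                       int used_wrap_counter)
    {
        /* val[1] holds the high 64 bits (len/id/flags) of each descriptor. */
        uint64x2_t hi01 = vld2q_u64((const uint64_t *)p_desc).val[1];
        uint64x2_t hi23 = vld2q_u64((const uint64_t *)(p_desc + 2)).val[1];

        /* Gather the 32-bit id/flags words of the four descriptors. */
        uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(hi01),
                                       vreinterpretq_u32_u64(hi23));

        /* Keep only the avail/used flag bits. */
        uint32x4_t v_flag = vandq_u32(v_desc, vdupq_n_u32(PACKED_FLAGS_MASK));

        /* Expected value depends on the used wrap counter. */
        uint32x4_t v_used = vdupq_n_u32(used_wrap_counter ?
                                        PACKED_FLAGS_MASK : 0);

        /* Each matching lane is all-ones, so the minimum across lanes is
         * UINT32_MAX only if every descriptor in the batch is used.
         */
        return vminvq_u32(vceqq_u32(v_flag, v_used)) == UINT32_MAX;
    }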

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/virtio/meson.build               |   1 +
 drivers/net/virtio/virtio_rxtx.c             |   7 +-
 drivers/net/virtio/virtio_rxtx_packed.h      |  16 ++
 drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++++++++
 4 files changed, 224 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c

diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index e1851b0a6..5af633686 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -34,6 +34,7 @@ elif arch_subdir == 'ppc'
 	sources += files('virtio_rxtx_simple_altivec.c')
 elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64')
 	sources += files('virtio_rxtx_simple_neon.c')
+	sources += files('virtio_rxtx_packed_neon.c')
 endif
 
 if is_linux
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index f915b8a2c..1deb77569 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -2020,7 +2020,8 @@ virtio_xmit_pkts_inorder(void *tx_queue,
 	return nb_tx;
 }
 
-#ifndef CC_AVX512_SUPPORT
+#if !defined(CC_AVX512_SUPPORT) && !defined(RTE_ARCH_ARM) && \
+	!defined(RTE_ARCH_ARM64)
 uint16_t
 virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 			    struct rte_mbuf **rx_pkts __rte_unused,
@@ -2028,7 +2029,9 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 {
 	return 0;
 }
+#endif
 
+#if !defined(CC_AVX512_SUPPORT)
 uint16_t
 virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 			    struct rte_mbuf **tx_pkts __rte_unused,
@@ -2036,4 +2039,4 @@ virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 {
 	return 0;
 }
-#endif /* ifndef CC_AVX512_SUPPORT */
+#endif
diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index b2447843b..fd2d6baa5 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -19,9 +19,16 @@
 #include "virtqueue.h"
 
 #define BYTE_SIZE 8
+
+#if defined(AVX512_SUPPORT)
 /* flag bits offset in packed ring desc higher 64bits */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+/* flag bits offset in packed ring desc 32bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
 	FLAGS_BITS_OFFSET)
@@ -44,8 +51,17 @@
 /* net hdr short size mask */
 #define NET_HDR_MASK 0x3F
 
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+/* The cache line size differs across aarch64 platforms,
+ * so use a batch size of four to match the minimum
+ * cache line size.
+ */
+#define PACKED_BATCH_SIZE 4
+#else
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 	sizeof(struct vring_packed_desc))
+#endif
+
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
 
 #ifdef VIRTIO_GCC_UNROLL_PRAGMA
diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.c b/drivers/net/virtio/virtio_rxtx_packed_neon.c
new file mode 100644
index 000000000..182afe5c6
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.c
@@ -0,0 +1,202 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+#include <rte_vect.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t head_size = hw->vtnet_hdr_size;
+	uint16_t id = vq->vq_used_cons_idx;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map packed descriptor to mbuf fields. */
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		0, 1,			/* octet 1~0, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		0, 1,			/* octet 1~0, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		8, 9,			/* octet 9~8, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		8, 9,			/* octet 9~8, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length. */
+	uint16x8_t len_adjust = {
+		0, 0,		/* ignore pkt_type field */
+		head_size,	/* sub head_size on pkt_len */
+		0,		/* ignore high 16 bits of pkt_len */
+		head_size,	/* sub head_size on data_len */
+		0, 0, 0		/* ignore non-length fields */
+	};
+
+	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
+	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
+
+	p_desc = &vq->vq_packed.ring.desc[id];
+	/* Load packed descriptor 0,1. */
+	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
+	/* Load packed descriptor 2,3. */
+	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
+
+	/* Only care avail/used bits. */
+	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
+	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
+				vreinterpretq_u32_u64(desc[1]));
+	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
+
+	uint32x4_t v_used_flag = vdupq_n_u32(0);
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
+
+	poly128_t desc_stats = vreinterpretq_p128_u32(vceqq_u32(v_flag,
+					v_used_flag));
+
+	/* Check all descs are used. */
+	if (!desc_stats)
+		return -1;
+
+	/* Load 2 mbuf pointers per time. */
+	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
+	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
+
+	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
+	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
+
+	/**
+	 *  Update data length and packet length for descriptor.
+	 *  structure of pkt_mb:
+	 *  --------------------------------------------------------------------
+	 *  |4 octet pkt_type|4 octet pkt_len|2 octet data_len|2 octet vlan_tci|
+	 *  --------------------------------------------------------------------
+	 */
+	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+
+	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+
+	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
+	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
+	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
+	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - head_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t num, nb_rx = 0;
+	uint32_t nb_enqueued = 0;
+	uint16_t free_cnt = vq->vq_free_thresh;
+
+	if (unlikely(hw->started == 0))
+		return nb_rx;
+
+	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
+	if (likely(num > PACKED_BATCH_SIZE))
+		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
+
+	while (num) {
+		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx += PACKED_BATCH_SIZE;
+			num -= PACKED_BATCH_SIZE;
+			continue;
+		}
+		if (!virtqueue_dequeue_single_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx++;
+			num--;
+			continue;
+		}
+		break;
+	};
+
+	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
+
+	rxvq->stats.packets += nb_rx;
+
+	if (likely(vq->vq_free_cnt >= free_cnt)) {
+		struct rte_mbuf *new_pkts[free_cnt];
+		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
+						free_cnt) == 0)) {
+			virtio_recv_refill_packed_vec(rxvq, new_pkts,
+					free_cnt);
+			nb_enqueued += free_cnt;
+		} else {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed += free_cnt;
+		}
+	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_RX_LOG(DEBUG, "Notified");
+		}
+	}
+
+	return nb_rx;
+}
-- 
2.28.0



* [dpdk-dev] [RFC 3/3] net/virtio: add election for packed vector Rx NEON path
  2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
  2020-09-11 12:09 ` [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
@ 2020-09-11 12:09 ` Joyce Kong
  2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
  4 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2020-09-11 12:09 UTC (permalink / raw)
  To: maxime.coquelin
  Cc: jerinj, dev, nd, honnappa.nagarahalli, ruifeng.wang, phil.yang

Add NEON vectorized path selection logic. The default setting comes from
the vectorized devarg, then each criterion is checked.

The packed ring vectorized NEON path requires (see the sketch below):
    NEON is supported by the compiler and the host
    the VERSION_1 and IN_ORDER features are negotiated
    the mergeable feature is not negotiated
    LRO offloading is disabled
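
For reference, a consolidated sketch of the runtime part of these
conditions (the helper name is hypothetical; the mergeable and LRO checks
are expressed here with the VIRTIO_NET_F_MRG_RXBUF feature bit and the
DEV_RX_OFFLOAD_TCP_LRO offload flag, and are evaluated by existing code
elsewhere in the driver). The whole branch is additionally guarded by
RTE_ARCH_ARM/RTE_ARCH_ARM64 at build time:

    /* Sketch only: whether the NEON packed vectorized Rx path may be used. */
    static int
    virtio_neon_packed_rx_vec_ok(struct virtio_hw *hw, uint64_t rx_offloads)
    {
        return rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
               vtpci_with_feature(hw, VIRTIO_F_VERSION_1) &&
               vtpci_with_feature(hw, VIRTIO_F_IN_ORDER) &&
               !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF) &&
               !(rx_offloads & DEV_RX_OFFLOAD_TCP_LRO);
    }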

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 doc/guides/nics/virtio.rst              |  4 ++--
 drivers/net/virtio/virtio_ethdev.c      | 19 +++++++++++++++----
 drivers/net/virtio/virtio_user_ethdev.c |  2 ++
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst
index 0daf25b22..fe9586699 100644
--- a/doc/guides/nics/virtio.rst
+++ b/doc/guides/nics/virtio.rst
@@ -483,8 +483,8 @@ according to below configuration:
 #. Packed virtqueue in-order non-mergeable path: If in-order feature is negotiated and
    Rx mergeable is not negotiated, this path will be selected.
 #. Packed virtqueue vectorized Rx path: If building and running environment support
-   AVX512 && in-order feature is negotiated && Rx mergeable is not negotiated &&
-   TCP_LRO Rx offloading is disabled && vectorized option enabled,
+   (AVX512 || ARCH_ARM || ARCH_ARM64) && in-order feature is negotiated && Rx mergeable
+   is not negotiated && TCP_LRO Rx offloading is disabled && vectorized option enabled,
    this path will be selected.
 #. Packed virtqueue vectorized Tx path: If building and running environment support
    AVX512 && in-order feature is negotiated && vectorized option enabled,
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index dc0093bdf..b36ea98cf 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -1958,12 +1958,14 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 		if (!vtpci_packed_queue(hw)) {
 			hw->use_vec_rx = 1;
 		} else {
-#if !defined(CC_AVX512_SUPPORT)
-			PMD_DRV_LOG(INFO,
-				"building environment do not support packed ring vectorized");
-#else
+#if defined(CC_AVX512_SUPPORT)
 			hw->use_vec_rx = 1;
 			hw->use_vec_tx = 1;
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+			hw->use_vec_rx = 1;
+#else
+			PMD_DRV_LOG(INFO,
+				"building environment do not support packed ring vectorized");
 #endif
 		}
 	}
@@ -2311,6 +2313,15 @@ virtio_dev_configure(struct rte_eth_dev *dev)
 			hw->use_vec_rx = 0;
 			hw->use_vec_tx = 0;
 		}
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+		if (hw->use_vec_rx &&
+		    (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) ||
+		     !vtpci_with_feature(hw, VIRTIO_F_IN_ORDER) ||
+		     !vtpci_with_feature(hw, VIRTIO_F_VERSION_1))) {
+			PMD_DRV_LOG(INFO,
+				"disabled packed ring vectorized path for requirements not met");
+			hw->use_vec_rx = 0;
+		}
 #else
 		hw->use_vec_rx = 0;
 		hw->use_vec_tx = 0;
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 6003f6d50..1cfeb388f 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -766,6 +766,8 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
 #if defined(CC_AVX512_SUPPORT)
 			hw->use_vec_rx = 1;
 			hw->use_vec_tx = 1;
+#elif defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+			hw->use_vec_rx = 1;
 #else
 			PMD_INIT_LOG(INFO,
 				"building environment do not support packed ring vectorized");
-- 
2.28.0



* Re: [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON
  2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
                   ` (2 preceding siblings ...)
  2020-09-11 12:09 ` [dpdk-dev] [RFC 3/3] net/virtio: add election for packed vector " Joyce Kong
@ 2020-10-05  7:34 ` Maxime Coquelin
  2020-10-08  6:54   ` Joyce Kong
  2020-10-15  9:01   ` Ruifeng Wang
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
  4 siblings, 2 replies; 26+ messages in thread
From: Maxime Coquelin @ 2020-10-05  7:34 UTC (permalink / raw)
  To: Joyce Kong; +Cc: jerinj, dev, nd, honnappa.nagarahalli, ruifeng.wang, phil.yang

Hi Joyce,

On 9/11/20 2:09 PM, Joyce Kong wrote:
> This patch set introduced vectorized RX path for packed ring
> with NEON.

Overall, the series looks good. I will have to trust you on
the NEON implementation. Ideally, having a detailed review from
someone from ARM would be good.

Could you please send a new revision with the build issues reported by
Intel CI fixed?

http://mails.dpdk.org/archives/test-report/2020-September/152501.html

Thanks,
Maxime


> Joyce Kong (3):
>   net/virtio: move AVX based Rx and Tx code to separate file
>   net/virtio: add vectorized packed ring Rx NEON path
>   net/virtio: add election for packed vector Rx NEON path
> 
>  doc/guides/nics/virtio.rst                   |   4 +-
>  drivers/net/virtio/meson.build               |   2 +
>  drivers/net/virtio/virtio_ethdev.c           |  19 +-
>  drivers/net/virtio/virtio_rxtx.c             |   7 +-
>  drivers/net/virtio/virtio_rxtx_packed.c      |  37 +++
>  drivers/net/virtio/virtio_rxtx_packed.h      | 300 +++++++++++++++++++
>  drivers/net/virtio/virtio_rxtx_packed_avx.c  | 264 +---------------
>  drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++
>  drivers/net/virtio/virtio_user_ethdev.c      |   2 +
>  9 files changed, 566 insertions(+), 271 deletions(-)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c
> 



* Re: [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON
  2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
@ 2020-10-08  6:54   ` Joyce Kong
  2020-10-15  9:01   ` Ruifeng Wang
  1 sibling, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2020-10-08  6:54 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: jerinj, dev, nd, Honnappa Nagarahalli, Ruifeng Wang, Phil Yang

Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, October 5, 2020 3:34 PM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: jerinj@marvell.com; dev@dpdk.org; nd <nd@arm.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>
> Subject: Re: [RFC 0/3] Vectorize packed ring RX path with NEON
> 
> Hi Joyce,
> 
> On 9/11/20 2:09 PM, Joyce Kong wrote:
> > This patch set introduced vectorized RX path for packed ring with
> > NEON.
> 
> Overall, the series looks good. I will have to trust you on the NEON
> implementation. Ideally, having a detailed review from someone from ARM
> would be good.
> 
> Could you please send a new revision with the build issues reported by Intel
> CI fixed?
> 
> http://mails.dpdk.org/archives/test-report/2020-September/152501.html
> 
> Thanks,
> Maxime
> 

Thanks for your comments, and I will send the new revision soon.

> 
> > Joyce Kong (3):
> >   net/virtio: move AVX based Rx and Tx code to separate file
> >   net/virtio: add vectorized packed ring Rx NEON path
> >   net/virtio: add election for packed vector Rx NEON path
> >
> >  doc/guides/nics/virtio.rst                   |   4 +-
> >  drivers/net/virtio/meson.build               |   2 +
> >  drivers/net/virtio/virtio_ethdev.c           |  19 +-
> >  drivers/net/virtio/virtio_rxtx.c             |   7 +-
> >  drivers/net/virtio/virtio_rxtx_packed.c      |  37 +++
> >  drivers/net/virtio/virtio_rxtx_packed.h      | 300 +++++++++++++++++++
> >  drivers/net/virtio/virtio_rxtx_packed_avx.c  | 264 +---------------
> > drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++
> >  drivers/net/virtio/virtio_user_ethdev.c      |   2 +
> >  9 files changed, 566 insertions(+), 271 deletions(-)  create mode
> > 100644 drivers/net/virtio/virtio_rxtx_packed.c
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c
> >



* Re: [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON
  2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
  2020-10-08  6:54   ` Joyce Kong
@ 2020-10-15  9:01   ` Ruifeng Wang
  2020-10-15  9:02     ` Maxime Coquelin
  1 sibling, 1 reply; 26+ messages in thread
From: Ruifeng Wang @ 2020-10-15  9:01 UTC (permalink / raw)
  To: Maxime Coquelin, Joyce Kong; +Cc: jerinj, dev, nd, Honnappa Nagarahalli, nd


> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, October 5, 2020 3:34 PM
> To: Joyce Kong <Joyce.Kong@arm.com>
> Cc: jerinj@marvell.com; dev@dpdk.org; nd <nd@arm.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>
> Subject: Re: [RFC 0/3] Vectorize packed ring RX path with NEON
> 
> Hi Joyce,
> 
> On 9/11/20 2:09 PM, Joyce Kong wrote:
> > This patch set introduced vectorized RX path for packed ring with
> > NEON.
> 
> Overall, the series looks good. I will have to trust you on the NEON
> implementation. Ideally, having a detailed review from someone from ARM
> would be good.
This patch doesn't target 20.11. A new version will be sent out, but not in the short term.
You can skip this one for the 20.11 release.

Thanks.
> 
> Could you please send a new revision with the build issues reported by Intel
> CI fixed?
> 
> http://mails.dpdk.org/archives/test-report/2020-September/152501.html
> 
> Thanks,
> Maxime
> 
> 
> > Joyce Kong (3):
> >   net/virtio: move AVX based Rx and Tx code to separate file
> >   net/virtio: add vectorized packed ring Rx NEON path
> >   net/virtio: add election for packed vector Rx NEON path
> >
> >  doc/guides/nics/virtio.rst                   |   4 +-
> >  drivers/net/virtio/meson.build               |   2 +
> >  drivers/net/virtio/virtio_ethdev.c           |  19 +-
> >  drivers/net/virtio/virtio_rxtx.c             |   7 +-
> >  drivers/net/virtio/virtio_rxtx_packed.c      |  37 +++
> >  drivers/net/virtio/virtio_rxtx_packed.h      | 300 +++++++++++++++++++
> >  drivers/net/virtio/virtio_rxtx_packed_avx.c  | 264 +---------------
> > drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++
> >  drivers/net/virtio/virtio_user_ethdev.c      |   2 +
> >  9 files changed, 566 insertions(+), 271 deletions(-)  create mode
> > 100644 drivers/net/virtio/virtio_rxtx_packed.c
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c
> >



* Re: [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON
  2020-10-15  9:01   ` Ruifeng Wang
@ 2020-10-15  9:02     ` Maxime Coquelin
  0 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2020-10-15  9:02 UTC (permalink / raw)
  To: Ruifeng Wang, Joyce Kong; +Cc: jerinj, dev, nd, Honnappa Nagarahalli



On 10/15/20 11:01 AM, Ruifeng Wang wrote:
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Monday, October 5, 2020 3:34 PM
>> To: Joyce Kong <Joyce.Kong@arm.com>
>> Cc: jerinj@marvell.com; dev@dpdk.org; nd <nd@arm.com>; Honnappa
>> Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang
>> <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>
>> Subject: Re: [RFC 0/3] Vectorize packed ring RX path with NEON
>>
>> Hi Joyce,
>>
>> On 9/11/20 2:09 PM, Joyce Kong wrote:
>>> This patch set introduced vectorized RX path for packed ring with
>>> NEON.
>>
>> Overall, the series looks good. I will have to trust you on the NEON
>> implementation. Ideally, having a detailed review from someone from ARM
>> would be good.
> This patch doesn’t target 20.11. New version will be sent out but not in short time.
> You can skip this one for 20.11 release.

Thanks Ruifeng for the update.
I'll mark the series as postponed then.

Maxime

> Thanks.
>>
>> Could you please send a new revision with the build issues reported by Intel
>> CI fixed?
>>
>> http://mails.dpdk.org/archives/test-report/2020-September/152501.html
>>
>> Thanks,
>> Maxime
>>
>>
>>> Joyce Kong (3):
>>>   net/virtio: move AVX based Rx and Tx code to separate file
>>>   net/virtio: add vectorized packed ring Rx NEON path
>>>   net/virtio: add election for packed vector Rx NEON path
>>>
>>>  doc/guides/nics/virtio.rst                   |   4 +-
>>>  drivers/net/virtio/meson.build               |   2 +
>>>  drivers/net/virtio/virtio_ethdev.c           |  19 +-
>>>  drivers/net/virtio/virtio_rxtx.c             |   7 +-
>>>  drivers/net/virtio/virtio_rxtx_packed.c      |  37 +++
>>>  drivers/net/virtio/virtio_rxtx_packed.h      | 300 +++++++++++++++++++
>>>  drivers/net/virtio/virtio_rxtx_packed_avx.c  | 264 +---------------
>>> drivers/net/virtio/virtio_rxtx_packed_neon.c | 202 +++++++++++++
>>>  drivers/net/virtio/virtio_user_ethdev.c      |   2 +
>>>  9 files changed, 566 insertions(+), 271 deletions(-)  create mode
>>> 100644 drivers/net/virtio/virtio_rxtx_packed.c
>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.c
>>>
> 



* [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON
  2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
                   ` (3 preceding siblings ...)
  2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
@ 2020-11-17 10:06 ` Joyce Kong
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
                     ` (4 more replies)
  4 siblings, 5 replies; 26+ messages in thread
From: Joyce Kong @ 2020-11-17 10:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd

This patch set introduces a vectorized RX/TX path for the packed ring with
NEON intrinsics.

With this patch set, the PVP case shows a 1.5% performance uplift for the
packed vectorized NEON path compared with the non-vectorized packed path,
at 0.001% acceptable loss, with 2 cores on the vhost side and 1 core on
the virtio side.

Joyce Kong (4):
  net/virtio: move AVX based Rx and Tx code to separate file
  net/virtio: add vectorized packed ring Rx NEON path
  net/virtio: add vectorized packed ring Tx NEON path
  net/virtio: add election for packed vector NEON path

 doc/guides/nics/virtio.rst                   |   6 +-
 drivers/net/virtio/meson.build               |   5 +-
 drivers/net/virtio/virtio_ethdev.c           |  19 +-
 drivers/net/virtio/virtio_rxtx.c             |   6 +-
 drivers/net/virtio/virtio_rxtx_packed.c      | 139 ++++
 drivers/net/virtio/virtio_rxtx_packed.h      | 317 ++++++++++
 drivers/net/virtio/virtio_rxtx_packed_avx.c  | 626 -------------------
 drivers/net/virtio/virtio_rxtx_packed_avx.h  | 239 +++++++
 drivers/net/virtio/virtio_rxtx_packed_neon.h | 293 +++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   2 +-
 10 files changed, 1012 insertions(+), 640 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
 delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h

-- 
2.28.0



* [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
@ 2020-11-17 10:06   ` Joyce Kong
  2021-01-05 14:06     ` Maxime Coquelin
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2020-11-17 10:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd

Split out the AVX instruction based virtio packed ring Rx and Tx
implementation into a separate file.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/net/virtio/meson.build              |   4 +-
 drivers/net/virtio/virtio_rxtx.c            |   6 +-
 drivers/net/virtio/virtio_rxtx_packed.c     | 137 +++++
 drivers/net/virtio/virtio_rxtx_packed.h     | 298 ++++++++++
 drivers/net/virtio/virtio_rxtx_packed_avx.c | 626 --------------------
 drivers/net/virtio/virtio_rxtx_packed_avx.h | 239 ++++++++
 6 files changed, 678 insertions(+), 632 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
 delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h

diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index eaed46373..01b8de6d4 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -13,12 +13,12 @@ if arch_subdir == 'x86'
 		if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
 			cflags += ['-DCC_AVX512_SUPPORT']
 			virtio_avx512_lib = static_library('virtio_avx512_lib',
-					      'virtio_rxtx_packed_avx.c',
+					      'virtio_rxtx_packed.c',
 					      dependencies: [static_rte_ethdev,
 						static_rte_kvargs, static_rte_bus_pci],
 					      include_directories: includes,
 					      c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
-			objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed_avx.c')
+			objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed.c')
 			if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
 				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
 			elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 77934e8c5..622d4bf20 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -2025,8 +2025,7 @@ virtio_xmit_pkts_inorder(void *tx_queue,
 	return nb_tx;
 }
 
-#ifndef CC_AVX512_SUPPORT
-uint16_t
+__rte_weak uint16_t
 virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 			    struct rte_mbuf **rx_pkts __rte_unused,
 			    uint16_t nb_pkts __rte_unused)
@@ -2034,11 +2033,10 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
 	return 0;
 }
 
-uint16_t
+__rte_weak uint16_t
 virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused,
 			    struct rte_mbuf **tx_pkts __rte_unused,
 			    uint16_t nb_pkts __rte_unused)
 {
 	return 0;
 }
-#endif /* ifndef CC_AVX512_SUPPORT */
diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c
new file mode 100644
index 000000000..99d9a5a99
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.c
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+#ifdef CC_AVX512_SUPPORT
+#include "virtio_rxtx_packed_avx.h"
+#endif
+
+uint16_t
+virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+			uint16_t nb_pkts)
+{
+	struct virtnet_tx *txvq = tx_queue;
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t nb_tx = 0;
+	uint16_t remained;
+
+	if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts))
+		return nb_tx;
+
+	if (unlikely(nb_pkts < 1))
+		return nb_pkts;
+
+	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
+
+	if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh)
+		virtio_xmit_cleanup_inorder_packed(vq, vq->vq_free_thresh);
+
+	remained = RTE_MIN(nb_pkts, vq->vq_free_cnt);
+
+	while (remained) {
+		if (remained >= PACKED_BATCH_SIZE) {
+			if (!virtqueue_enqueue_batch_packed_vec(txvq,
+						&tx_pkts[nb_tx])) {
+				nb_tx += PACKED_BATCH_SIZE;
+				remained -= PACKED_BATCH_SIZE;
+				continue;
+			}
+		}
+		if (!virtqueue_enqueue_single_packed_vec(txvq,
+					tx_pkts[nb_tx])) {
+			nb_tx++;
+			remained--;
+			continue;
+		}
+		break;
+	};
+
+	txvq->stats.packets += nb_tx;
+
+	if (likely(nb_tx)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
+		}
+	}
+
+	return nb_tx;
+}
+
+uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t num, nb_rx = 0;
+	uint32_t nb_enqueued = 0;
+	uint16_t free_cnt = vq->vq_free_thresh;
+
+	if (unlikely(hw->started == 0))
+		return nb_rx;
+
+	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
+	if (likely(num > PACKED_BATCH_SIZE))
+		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
+
+	while (num) {
+		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx += PACKED_BATCH_SIZE;
+			num -= PACKED_BATCH_SIZE;
+			continue;
+		}
+		if (!virtqueue_dequeue_single_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx++;
+			num--;
+			continue;
+		}
+		break;
+	};
+
+	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
+
+	rxvq->stats.packets += nb_rx;
+
+	if (likely(vq->vq_free_cnt >= free_cnt)) {
+		struct rte_mbuf *new_pkts[free_cnt];
+		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
+						free_cnt) == 0)) {
+			virtio_recv_refill_packed_vec(rxvq, new_pkts,
+					free_cnt);
+			nb_enqueued += free_cnt;
+		} else {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed += free_cnt;
+		}
+	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_RX_LOG(DEBUG, "Notified");
+		}
+	}
+
+	return nb_rx;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
new file mode 100644
index 000000000..b0b1d63ec
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -0,0 +1,298 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _VIRTIO_RXTX_PACKED_H_
+#define _VIRTIO_RXTX_PACKED_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+#define BYTE_SIZE 8
+/* flag bits offset in packed ring desc higher 64bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
+	FLAGS_BITS_OFFSET)
+
+/* reference count offset in mbuf rearm data */
+#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+/* segment number offset in mbuf rearm data */
+#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+/* default rearm data */
+#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
+	1ULL << REFCNT_BITS_OFFSET)
+
+/* id bits offset in packed ring desc higher 64bits */
+#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+/* net hdr short size mask */
+#define NET_HDR_MASK 0x3F
+
+#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
+	sizeof(struct vring_packed_desc))
+#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
+
+#ifdef VIRTIO_GCC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_ICC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifndef virtio_for_each_try_unroll
+#define virtio_for_each_try_unroll(iter, val, size) \
+	for (iter = val; iter < size; iter++)
+#endif
+
+static inline void
+virtio_update_batch_stats(struct virtnet_stats *stats,
+			  uint16_t pkt_len1,
+			  uint16_t pkt_len2,
+			  uint16_t pkt_len3,
+			  uint16_t pkt_len4)
+{
+	stats->bytes += pkt_len1;
+	stats->bytes += pkt_len2;
+	stats->bytes += pkt_len3;
+	stats->bytes += pkt_len4;
+}
+
+static inline int
+virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
+				    struct rte_mbuf *txm)
+{
+	struct virtqueue *vq = txvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint16_t slots, can_push = 0, use_indirect = 0;
+	int16_t need;
+
+	/* optimize ring usage */
+	if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
+	     vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
+	     rte_mbuf_refcnt_read(txm) == 1 && RTE_MBUF_DIRECT(txm) &&
+	     txm->nb_segs == 1 && rte_pktmbuf_headroom(txm) >= hdr_size)
+		can_push = 1;
+	else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
+		 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
+		use_indirect = 1;
+
+	/* How many main ring entries are needed for this Tx?
+	 * indirect   => 1
+	 * any_layout => number of segments
+	 * default    => number of segments + 1
+	 */
+	slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
+	need = slots - vq->vq_free_cnt;
+
+	/* Positive value indicates it needs free vring descriptors */
+	if (unlikely(need > 0)) {
+		virtio_xmit_cleanup_inorder_packed(vq, need);
+		need = slots - vq->vq_free_cnt;
+		if (unlikely(need > 0)) {
+			PMD_TX_LOG(ERR,
+				   "No free tx descriptors to transmit");
+			return -1;
+		}
+	}
+
+	/* Enqueue Packet buffers */
+	virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect,
+				can_push, 1);
+
+	txvq->stats.bytes += txm->pkt_len;
+	return 0;
+}
+
+/* Optionally fill offload information in structure */
+static inline int
+virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct rte_net_hdr_lens hdr_lens;
+	uint32_t hdrlen, ptype;
+	int l4_supported = 0;
+
+	/* nothing to do */
+	if (hdr->flags == 0)
+		return 0;
+
+	/* GSO not supported in vec path, skip the check */
+	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
+
+	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
+	m->packet_type = ptype;
+	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
+		l4_supported = 1;
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
+		if (hdr->csum_start <= hdrlen && l4_supported) {
+			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
+		} else {
+			/* Unknown proto or tunnel, do sw cksum. We can assume
+			 * the cksum field is in the first segment since the
+			 * buffers we provided to the host are large enough.
+			 * In case of SCTP, this will be wrong since it's a CRC
+			 * but there's nothing we can do.
+			 */
+			uint16_t csum = 0, off;
+
+			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
+				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
+				&csum) < 0)
+				return -1;
+			if (likely(csum != 0xffff))
+				csum = ~csum;
+			off = hdr->csum_offset + hdr->csum_start;
+			if (rte_pktmbuf_data_len(m) >= off + 1)
+				*rte_pktmbuf_mtod_offset(m, uint16_t *,
+					off) = csum;
+		}
+	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
+		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
+				    struct rte_mbuf **rx_pkts)
+{
+	uint16_t used_idx, id;
+	uint32_t len;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint32_t hdr_size = hw->vtnet_hdr_size;
+	struct virtio_net_hdr *hdr;
+	struct vring_packed_desc *desc;
+	struct rte_mbuf *cookie;
+
+	desc = vq->vq_packed.ring.desc;
+	used_idx = vq->vq_used_cons_idx;
+	if (!desc_is_used(&desc[used_idx], vq))
+		return -1;
+
+	len = desc[used_idx].len;
+	id = desc[used_idx].id;
+	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
+	if (unlikely(cookie == NULL)) {
+		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
+				vq->vq_used_cons_idx);
+		return -1;
+	}
+	rte_prefetch0(cookie);
+	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
+
+	cookie->data_off = RTE_PKTMBUF_HEADROOM;
+	cookie->ol_flags = 0;
+	cookie->pkt_len = (uint32_t)(len - hdr_size);
+	cookie->data_len = (uint32_t)(len - hdr_size);
+
+	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size);
+	if (hw->has_rx_offload)
+		virtio_vec_rx_offload(cookie, hdr);
+
+	*rx_pkts = cookie;
+
+	rxvq->stats.bytes += cookie->pkt_len;
+
+	vq->vq_free_cnt++;
+	vq->vq_used_cons_idx++;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static inline void
+virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
+			      struct rte_mbuf **cookie,
+			      uint16_t num)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
+	uint16_t flags = vq->vq_packed.cached_flags;
+	struct virtio_hw *hw = vq->hw;
+	struct vq_desc_extra *dxp;
+	uint16_t idx, i;
+	uint16_t batch_num, total_num = 0;
+	uint16_t head_idx = vq->vq_avail_idx;
+	uint16_t head_flag = vq->vq_packed.cached_flags;
+	uint64_t addr;
+
+	do {
+		idx = vq->vq_avail_idx;
+
+		batch_num = PACKED_BATCH_SIZE;
+		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+			batch_num = vq->vq_nentries - idx;
+		if (unlikely((total_num + batch_num) > num))
+			batch_num = num - total_num;
+
+		virtio_for_each_try_unroll(i, 0, batch_num) {
+			dxp = &vq->vq_descx[idx + i];
+			dxp->cookie = (void *)cookie[total_num + i];
+
+			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
+				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
+			start_dp[idx + i].addr = addr;
+			start_dp[idx + i].len = cookie[total_num + i]->buf_len
+				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
+			if (total_num || i) {
+				virtqueue_store_flags_packed(&start_dp[idx + i],
+						flags, hw->weak_barriers);
+			}
+		}
+
+		vq->vq_avail_idx += batch_num;
+		if (vq->vq_avail_idx >= vq->vq_nentries) {
+			vq->vq_avail_idx -= vq->vq_nentries;
+			vq->vq_packed.cached_flags ^=
+				VRING_PACKED_DESC_F_AVAIL_USED;
+			flags = vq->vq_packed.cached_flags;
+		}
+		total_num += batch_num;
+	} while (total_num < num);
+
+	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
+				hw->weak_barriers);
+	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
+}
+
+#endif /* _VIRTIO_RXTX_PACKED_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
deleted file mode 100644
index 9bc62719e..000000000
--- a/drivers/net/virtio/virtio_rxtx_packed_avx.c
+++ /dev/null
@@ -1,626 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2020 Intel Corporation
- */
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include <rte_net.h>
-
-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtio_pci.h"
-#include "virtqueue.h"
-
-#define BYTE_SIZE 8
-/* flag bits offset in packed ring desc higher 64bits */
-#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
-	FLAGS_BITS_OFFSET)
-
-/* reference count offset in mbuf rearm data */
-#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-/* segment number offset in mbuf rearm data */
-#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
-	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-
-/* default rearm data */
-#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
-	1ULL << REFCNT_BITS_OFFSET)
-
-/* id bits offset in packed ring desc higher 64bits */
-#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
-	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
-
-/* net hdr short size mask */
-#define NET_HDR_MASK 0x3F
-
-#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
-	sizeof(struct vring_packed_desc))
-#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
-
-#ifdef VIRTIO_GCC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifdef VIRTIO_ICC_UNROLL_PRAGMA
-#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
-	for (iter = val; iter < size; iter++)
-#endif
-
-#ifndef virtio_for_each_try_unroll
-#define virtio_for_each_try_unroll(iter, val, num) \
-	for (iter = val; iter < num; iter++)
-#endif
-
-static inline void
-virtio_update_batch_stats(struct virtnet_stats *stats,
-			  uint16_t pkt_len1,
-			  uint16_t pkt_len2,
-			  uint16_t pkt_len3,
-			  uint16_t pkt_len4)
-{
-	stats->bytes += pkt_len1;
-	stats->bytes += pkt_len2;
-	stats->bytes += pkt_len3;
-	stats->bytes += pkt_len4;
-}
-
-static inline int
-virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
-				   struct rte_mbuf **tx_pkts)
-{
-	struct virtqueue *vq = txvq->vq;
-	uint16_t head_size = vq->hw->vtnet_hdr_size;
-	uint16_t idx = vq->vq_avail_idx;
-	struct virtio_net_hdr *hdr;
-	struct vq_desc_extra *dxp;
-	uint16_t i, cmp;
-
-	if (vq->vq_avail_idx & PACKED_BATCH_MASK)
-		return -1;
-
-	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
-		return -1;
-
-	/* Load four mbufs rearm data */
-	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
-	RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64);
-	__m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data,
-					  *tx_pkts[2]->rearm_data,
-					  *tx_pkts[1]->rearm_data,
-					  *tx_pkts[0]->rearm_data);
-
-	/* refcnt=1 and nb_segs=1 */
-	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
-	__m256i head_rooms = _mm256_set1_epi16(head_size);
-
-	/* Check refcnt and nb_segs */
-	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
-	cmp = _mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref);
-	if (unlikely(cmp))
-		return -1;
-
-	/* Check headroom is enough */
-	const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12;
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
-		offsetof(struct rte_mbuf, rearm_data));
-	cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms);
-	if (unlikely(cmp))
-		return -1;
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		dxp = &vq->vq_descx[idx + i];
-		dxp->ndescs = 1;
-		dxp->cookie = tx_pkts[i];
-	}
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		tx_pkts[i]->data_off -= head_size;
-		tx_pkts[i]->data_len += head_size;
-	}
-
-	__m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[3], vq),
-			tx_pkts[2]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[2], vq),
-			tx_pkts[1]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[1], vq),
-			tx_pkts[0]->data_len,
-			VIRTIO_MBUF_ADDR(tx_pkts[0], vq));
-
-	/* id offset and data offset */
-	__m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET,
-						tx_pkts[3]->data_off,
-						(uint64_t)2 << ID_BITS_OFFSET,
-						tx_pkts[2]->data_off,
-						(uint64_t)1 << ID_BITS_OFFSET,
-						tx_pkts[1]->data_off,
-						0, tx_pkts[0]->data_off);
-
-	__m512i new_descs = _mm512_add_epi64(descs_base, data_offsets);
-
-	uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET |
-		(uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET;
-
-	/* flags offset and guest virtual address offset */
-	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
-	__m512i v_offset = _mm512_broadcast_i32x4(flag_offset);
-	__m512i v_desc = _mm512_add_epi64(new_descs, v_offset);
-
-	if (!vq->hw->has_tx_offload) {
-		__m128i all_mask = _mm_set1_epi16(0xFFFF);
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
-					struct virtio_net_hdr *, -head_size);
-			__m128i v_hdr = _mm_loadu_si128((void *)hdr);
-			if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK,
-							v_hdr, all_mask))) {
-				__m128i all_zero = _mm_setzero_si128();
-				_mm_mask_storeu_epi16((void *)hdr,
-						NET_HDR_MASK, all_zero);
-			}
-		}
-	} else {
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
-					struct virtio_net_hdr *, -head_size);
-			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
-		}
-	}
-
-	/* Enqueue Packet buffers */
-	_mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc);
-
-	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
-			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
-			tx_pkts[3]->pkt_len);
-
-	vq->vq_avail_idx += PACKED_BATCH_SIZE;
-	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
-
-	if (vq->vq_avail_idx >= vq->vq_nentries) {
-		vq->vq_avail_idx -= vq->vq_nentries;
-		vq->vq_packed.cached_flags ^=
-			VRING_PACKED_DESC_F_AVAIL_USED;
-	}
-
-	return 0;
-}
-
-static inline int
-virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq,
-				    struct rte_mbuf *txm)
-{
-	struct virtqueue *vq = txvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t hdr_size = hw->vtnet_hdr_size;
-	uint16_t slots, can_push = 0, use_indirect = 0;
-	int16_t need;
-
-	/* optimize ring usage */
-	if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
-	      vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
-	    rte_mbuf_refcnt_read(txm) == 1 &&
-	    RTE_MBUF_DIRECT(txm) &&
-	    txm->nb_segs == 1 &&
-	    rte_pktmbuf_headroom(txm) >= hdr_size)
-		can_push = 1;
-	else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
-		 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
-		use_indirect = 1;
-	/* How many main ring entries are needed to this Tx?
-	 * indirect   => 1
-	 * any_layout => number of segments
-	 * default    => number of segments + 1
-	 */
-	slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
-	need = slots - vq->vq_free_cnt;
-
-	/* Positive value indicates it need free vring descriptors */
-	if (unlikely(need > 0)) {
-		virtio_xmit_cleanup_inorder_packed(vq, need);
-		need = slots - vq->vq_free_cnt;
-		if (unlikely(need > 0)) {
-			PMD_TX_LOG(ERR,
-				   "No free tx descriptors to transmit");
-			return -1;
-		}
-	}
-
-	/* Enqueue Packet buffers */
-	virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect,
-				can_push, 1);
-
-	txvq->stats.bytes += txm->pkt_len;
-	return 0;
-}
-
-uint16_t
-virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
-			uint16_t nb_pkts)
-{
-	struct virtnet_tx *txvq = tx_queue;
-	struct virtqueue *vq = txvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t nb_tx = 0;
-	uint16_t remained;
-
-	if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts))
-		return nb_tx;
-
-	if (unlikely(nb_pkts < 1))
-		return nb_pkts;
-
-	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
-
-	if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh)
-		virtio_xmit_cleanup_inorder_packed(vq, vq->vq_free_thresh);
-
-	remained = RTE_MIN(nb_pkts, vq->vq_free_cnt);
-
-	while (remained) {
-		if (remained >= PACKED_BATCH_SIZE) {
-			if (!virtqueue_enqueue_batch_packed_vec(txvq,
-						&tx_pkts[nb_tx])) {
-				nb_tx += PACKED_BATCH_SIZE;
-				remained -= PACKED_BATCH_SIZE;
-				continue;
-			}
-		}
-		if (!virtqueue_enqueue_single_packed_vec(txvq,
-					tx_pkts[nb_tx])) {
-			nb_tx++;
-			remained--;
-			continue;
-		}
-		break;
-	};
-
-	txvq->stats.packets += nb_tx;
-
-	if (likely(nb_tx)) {
-		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
-			virtqueue_notify(vq);
-			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
-		}
-	}
-
-	return nb_tx;
-}
-
-/* Optionally fill offload information in structure */
-static inline int
-virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
-{
-	struct rte_net_hdr_lens hdr_lens;
-	uint32_t hdrlen, ptype;
-	int l4_supported = 0;
-
-	/* nothing to do */
-	if (hdr->flags == 0)
-		return 0;
-
-	/* GSO not support in vec path, skip check */
-	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
-
-	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
-	m->packet_type = ptype;
-	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
-	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
-		l4_supported = 1;
-
-	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
-		if (hdr->csum_start <= hdrlen && l4_supported) {
-			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
-		} else {
-			/* Unknown proto or tunnel, do sw cksum. We can assume
-			 * the cksum field is in the first segment since the
-			 * buffers we provided to the host are large enough.
-			 * In case of SCTP, this will be wrong since it's a CRC
-			 * but there's nothing we can do.
-			 */
-			uint16_t csum = 0, off;
-
-			if (rte_raw_cksum_mbuf(m, hdr->csum_start,
-				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
-				&csum) < 0)
-				return -1;
-			if (likely(csum != 0xffff))
-				csum = ~csum;
-			off = hdr->csum_offset + hdr->csum_start;
-			if (rte_pktmbuf_data_len(m) >= off + 1)
-				*rte_pktmbuf_mtod_offset(m, uint16_t *,
-					off) = csum;
-		}
-	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
-		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
-	}
-
-	return 0;
-}
-
-static inline uint16_t
-virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
-				   struct rte_mbuf **rx_pkts)
-{
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t hdr_size = hw->vtnet_hdr_size;
-	uint64_t addrs[PACKED_BATCH_SIZE];
-	uint16_t id = vq->vq_used_cons_idx;
-	uint8_t desc_stats;
-	uint16_t i;
-	void *desc_addr;
-
-	if (id & PACKED_BATCH_MASK)
-		return -1;
-
-	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
-		return -1;
-
-	/* only care avail/used bits */
-#if defined(RTE_ARCH_I686)
-	__m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
-					   PACKED_FLAGS_MASK, 0x0);
-#else
-	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
-#endif
-	desc_addr = &vq->vq_packed.ring.desc[id];
-
-	__m512i v_desc = _mm512_loadu_si512(desc_addr);
-	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
-
-	__m512i v_used_flag = _mm512_setzero_si512();
-	if (vq->vq_packed.used_wrap_counter)
-#if defined(RTE_ARCH_I686)
-		v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
-						PACKED_FLAGS_MASK, 0x0);
-#else
-		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
-#endif
-
-	/* Check all descs are used */
-	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
-	if (desc_stats)
-		return -1;
-
-	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
-		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
-
-		addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1;
-	}
-
-	/*
-	 * load len from desc, store into mbuf pkt_len and data_len
-	 * len limiated by l6bit buf_len, pkt_len[16:31] can be ignored
-	 */
-	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
-	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
-
-	/* reduce hdr_len from pkt_len and data_len */
-	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
-			(uint32_t)-hdr_size);
-
-	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
-
-	/* assert offset of data_len */
-	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
-		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
-
-	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
-					   addrs[2] + 8, addrs[2],
-					   addrs[1] + 8, addrs[1],
-					   addrs[0] + 8, addrs[0]);
-	/* batch store into mbufs */
-	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
-
-	if (hw->has_rx_offload) {
-		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-			char *addr = (char *)rx_pkts[i]->buf_addr +
-				RTE_PKTMBUF_HEADROOM - hdr_size;
-			virtio_vec_rx_offload(rx_pkts[i],
-					(struct virtio_net_hdr *)addr);
-		}
-	}
-
-	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
-			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
-			rx_pkts[3]->pkt_len);
-
-	vq->vq_free_cnt += PACKED_BATCH_SIZE;
-
-	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
-	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
-		vq->vq_used_cons_idx -= vq->vq_nentries;
-		vq->vq_packed.used_wrap_counter ^= 1;
-	}
-
-	return 0;
-}
-
-static uint16_t
-virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
-				    struct rte_mbuf **rx_pkts)
-{
-	uint16_t used_idx, id;
-	uint32_t len;
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint32_t hdr_size = hw->vtnet_hdr_size;
-	struct virtio_net_hdr *hdr;
-	struct vring_packed_desc *desc;
-	struct rte_mbuf *cookie;
-
-	desc = vq->vq_packed.ring.desc;
-	used_idx = vq->vq_used_cons_idx;
-	if (!desc_is_used(&desc[used_idx], vq))
-		return -1;
-
-	len = desc[used_idx].len;
-	id = desc[used_idx].id;
-	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
-	if (unlikely(cookie == NULL)) {
-		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
-				vq->vq_used_cons_idx);
-		return -1;
-	}
-	rte_prefetch0(cookie);
-	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
-
-	cookie->data_off = RTE_PKTMBUF_HEADROOM;
-	cookie->ol_flags = 0;
-	cookie->pkt_len = (uint32_t)(len - hdr_size);
-	cookie->data_len = (uint32_t)(len - hdr_size);
-
-	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
-					RTE_PKTMBUF_HEADROOM - hdr_size);
-	if (hw->has_rx_offload)
-		virtio_vec_rx_offload(cookie, hdr);
-
-	*rx_pkts = cookie;
-
-	rxvq->stats.bytes += cookie->pkt_len;
-
-	vq->vq_free_cnt++;
-	vq->vq_used_cons_idx++;
-	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
-		vq->vq_used_cons_idx -= vq->vq_nentries;
-		vq->vq_packed.used_wrap_counter ^= 1;
-	}
-
-	return 0;
-}
-
-static inline void
-virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
-			      struct rte_mbuf **cookie,
-			      uint16_t num)
-{
-	struct virtqueue *vq = rxvq->vq;
-	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
-	uint16_t flags = vq->vq_packed.cached_flags;
-	struct virtio_hw *hw = vq->hw;
-	struct vq_desc_extra *dxp;
-	uint16_t idx, i;
-	uint16_t batch_num, total_num = 0;
-	uint16_t head_idx = vq->vq_avail_idx;
-	uint16_t head_flag = vq->vq_packed.cached_flags;
-	uint64_t addr;
-
-	do {
-		idx = vq->vq_avail_idx;
-
-		batch_num = PACKED_BATCH_SIZE;
-		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
-			batch_num = vq->vq_nentries - idx;
-		if (unlikely((total_num + batch_num) > num))
-			batch_num = num - total_num;
-
-		virtio_for_each_try_unroll(i, 0, batch_num) {
-			dxp = &vq->vq_descx[idx + i];
-			dxp->cookie = (void *)cookie[total_num + i];
-
-			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
-				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
-			start_dp[idx + i].addr = addr;
-			start_dp[idx + i].len = cookie[total_num + i]->buf_len
-				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
-			if (total_num || i) {
-				virtqueue_store_flags_packed(&start_dp[idx + i],
-						flags, hw->weak_barriers);
-			}
-		}
-
-		vq->vq_avail_idx += batch_num;
-		if (vq->vq_avail_idx >= vq->vq_nentries) {
-			vq->vq_avail_idx -= vq->vq_nentries;
-			vq->vq_packed.cached_flags ^=
-				VRING_PACKED_DESC_F_AVAIL_USED;
-			flags = vq->vq_packed.cached_flags;
-		}
-		total_num += batch_num;
-	} while (total_num < num);
-
-	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
-				hw->weak_barriers);
-	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
-}
-
-uint16_t
-virtio_recv_pkts_packed_vec(void *rx_queue,
-			    struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
-{
-	struct virtnet_rx *rxvq = rx_queue;
-	struct virtqueue *vq = rxvq->vq;
-	struct virtio_hw *hw = vq->hw;
-	uint16_t num, nb_rx = 0;
-	uint32_t nb_enqueued = 0;
-	uint16_t free_cnt = vq->vq_free_thresh;
-
-	if (unlikely(hw->started == 0))
-		return nb_rx;
-
-	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
-	if (likely(num > PACKED_BATCH_SIZE))
-		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
-
-	while (num) {
-		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
-					&rx_pkts[nb_rx])) {
-			nb_rx += PACKED_BATCH_SIZE;
-			num -= PACKED_BATCH_SIZE;
-			continue;
-		}
-		if (!virtqueue_dequeue_single_packed_vec(rxvq,
-					&rx_pkts[nb_rx])) {
-			nb_rx++;
-			num--;
-			continue;
-		}
-		break;
-	};
-
-	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
-
-	rxvq->stats.packets += nb_rx;
-
-	if (likely(vq->vq_free_cnt >= free_cnt)) {
-		struct rte_mbuf *new_pkts[free_cnt];
-		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
-						free_cnt) == 0)) {
-			virtio_recv_refill_packed_vec(rxvq, new_pkts,
-					free_cnt);
-			nb_enqueued += free_cnt;
-		} else {
-			struct rte_eth_dev *dev =
-				&rte_eth_devices[rxvq->port_id];
-			dev->data->rx_mbuf_alloc_failed += free_cnt;
-		}
-	}
-
-	if (likely(nb_enqueued)) {
-		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
-			virtqueue_notify(vq);
-			PMD_RX_LOG(DEBUG, "Notified");
-		}
-	}
-
-	return nb_rx;
-}
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.h b/drivers/net/virtio/virtio_rxtx_packed_avx.h
new file mode 100644
index 000000000..f83182884
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_avx.h
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline int
+virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
+				   struct rte_mbuf **tx_pkts)
+{
+	struct virtqueue *vq = txvq->vq;
+	uint16_t head_size = vq->hw->vtnet_hdr_size;
+	uint16_t idx = vq->vq_avail_idx;
+	struct virtio_net_hdr *hdr;
+	struct vq_desc_extra *dxp;
+	uint16_t i, cmp;
+
+	if (vq->vq_avail_idx & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Load four mbufs rearm data */
+	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
+	RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64);
+	__m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data,
+					  *tx_pkts[2]->rearm_data,
+					  *tx_pkts[1]->rearm_data,
+					  *tx_pkts[0]->rearm_data);
+
+	/* refcnt=1 and nb_segs=1 */
+	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
+	__m256i head_rooms = _mm256_set1_epi16(head_size);
+
+	/* Check refcnt and nb_segs */
+	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
+	cmp = _mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref);
+	if (unlikely(cmp))
+		return -1;
+
+	/* Check headroom is enough */
+	const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12;
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
+		offsetof(struct rte_mbuf, rearm_data));
+	cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms);
+	if (unlikely(cmp))
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		dxp = &vq->vq_descx[idx + i];
+		dxp->ndescs = 1;
+		dxp->cookie = tx_pkts[i];
+	}
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		tx_pkts[i]->data_off -= head_size;
+		tx_pkts[i]->data_len += head_size;
+	}
+
+	__m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[3], vq),
+			tx_pkts[2]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[2], vq),
+			tx_pkts[1]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[1], vq),
+			tx_pkts[0]->data_len,
+			VIRTIO_MBUF_ADDR(tx_pkts[0], vq));
+
+	/* id offset and data offset */
+	__m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET,
+						tx_pkts[3]->data_off,
+						(uint64_t)2 << ID_BITS_OFFSET,
+						tx_pkts[2]->data_off,
+						(uint64_t)1 << ID_BITS_OFFSET,
+						tx_pkts[1]->data_off,
+						0, tx_pkts[0]->data_off);
+
+	__m512i new_descs = _mm512_add_epi64(descs_base, data_offsets);
+
+	uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET |
+		(uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET;
+
+	/* flags offset and guest virtual address offset */
+	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
+	__m512i v_offset = _mm512_broadcast_i32x4(flag_offset);
+	__m512i v_desc = _mm512_add_epi64(new_descs, v_offset);
+
+	if (!vq->hw->has_tx_offload) {
+		__m128i all_mask = _mm_set1_epi16(0xFFFF);
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			__m128i v_hdr = _mm_loadu_si128((void *)hdr);
+			if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK,
+							v_hdr, all_mask))) {
+				__m128i all_zero = _mm_setzero_si128();
+				_mm_mask_storeu_epi16((void *)hdr,
+						NET_HDR_MASK, all_zero);
+			}
+		}
+	} else {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
+		}
+	}
+
+	/* Enqueue Packet buffers */
+	_mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc);
+
+	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
+			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
+			tx_pkts[3]->pkt_len);
+
+	vq->vq_avail_idx += PACKED_BATCH_SIZE;
+	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
+
+	if (vq->vq_avail_idx >= vq->vq_nentries) {
+		vq->vq_avail_idx -= vq->vq_nentries;
+		vq->vq_packed.cached_flags ^=
+			VRING_PACKED_DESC_F_AVAIL_USED;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint64_t addrs[PACKED_BATCH_SIZE];
+	uint16_t id = vq->vq_used_cons_idx;
+	uint8_t desc_stats;
+	uint16_t i;
+	void *desc_addr;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* only care avail/used bits */
+#if defined(RTE_ARCH_I686)
+	__m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
+					   PACKED_FLAGS_MASK, 0x0);
+#else
+	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+#endif
+	desc_addr = &vq->vq_packed.ring.desc[id];
+
+	__m512i v_desc = _mm512_loadu_si512(desc_addr);
+	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
+
+	__m512i v_used_flag = _mm512_setzero_si512();
+	if (vq->vq_packed.used_wrap_counter)
+#if defined(RTE_ARCH_I686)
+		v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
+						PACKED_FLAGS_MASK, 0x0);
+#else
+		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+#endif
+
+	/* Check all descs are used */
+	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
+	if (desc_stats)
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
+		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
+
+		addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1;
+	}
+
+	/*
+	 * load len from desc, store into mbuf pkt_len and data_len
+	 * len limited by 16-bit buf_len, pkt_len[16:31] can be ignored
+	 */
+	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
+	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
+
+	/* reduce hdr_len from pkt_len and data_len */
+	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
+			(uint32_t)-hdr_size);
+
+	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
+
+	/* assert offset of data_len */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
+					   addrs[2] + 8, addrs[2],
+					   addrs[1] + 8, addrs[1],
+					   addrs[0] + 8, addrs[0]);
+	/* batch store into mbufs */
+	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - hdr_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
-- 
2.28.0


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
@ 2020-11-17 10:06   ` Joyce Kong
  2021-01-05 14:16     ` Maxime Coquelin
  2021-01-08 17:02     ` Ferruh Yigit
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx " Joyce Kong
                     ` (2 subsequent siblings)
  4 siblings, 2 replies; 26+ messages in thread
From: Joyce Kong @ 2020-11-17 10:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd

Optimize packed ring Rx batch path with NEON instructions.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
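Note: the core of the batch dequeue is the avail/used flag check over four
descriptors at once. As a rough, self-contained sketch of that logic (the
struct and flag values below are simplified stand-ins for the real vring
definitions, not the driver code):

#include <stdint.h>
#include <stdbool.h>

/* Simplified stand-in for struct vring_packed_desc (16 bytes). */
struct pdesc {
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

#define F_AVAIL		(1 << 7)
#define F_USED		(1 << 15)
#define AVAIL_USED_MASK	(F_AVAIL | F_USED)
#define BATCH		4

/* A batch of four descriptors is consumable only when the avail and
 * used bits of every descriptor match the current used wrap counter.
 */
static bool
batch_is_used(const struct pdesc *d, bool wrap)
{
	uint16_t expect = wrap ? AVAIL_USED_MASK : 0;
	int i;

	for (i = 0; i < BATCH; i++)
		if ((d[i].flags & AVAIL_USED_MASK) != expect)
			return false;
	return true;
}

The NEON path performs the same comparison on all four flag words in one
vceqq_u32 and falls back to the scalar descriptor path when the check fails.
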
 drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
 drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h

diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index b0b1d63ec..8f5198ad7 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -19,9 +19,16 @@
 #include "virtqueue.h"
 
 #define BYTE_SIZE 8
+
+#ifdef CC_AVX512_SUPPORT
 /* flag bits offset in packed ring desc higher 64bits */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+#elif defined(RTE_ARCH_ARM)
+/* flag bits offset in packed ring desc from ID */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
 	FLAGS_BITS_OFFSET)
@@ -44,8 +51,16 @@
 /* net hdr short size mask */
 #define NET_HDR_MASK 0x3F
 
+#ifdef RTE_ARCH_ARM
+/* Cache line sizes differ across Arm platforms, so use a batch size of
+ * four to match the minimum cache line size and fit the NEON register
+ * width.
+ */
+#define PACKED_BATCH_SIZE 4
+#else
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 	sizeof(struct vring_packed_desc))
+#endif
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
 
 #ifdef VIRTIO_GCC_UNROLL_PRAGMA
diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
new file mode 100644
index 000000000..fb1e49909
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Arm Corporation
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+#include <rte_vect.h>
+
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtio_rxtx_packed.h"
+#include "virtqueue.h"
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t head_size = hw->vtnet_hdr_size;
+	uint16_t id = vq->vq_used_cons_idx;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map packed descriptor to mbuf fields. */
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		0, 1,			/* octet 1~0, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		0, 1,			/* octet 1~0, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
+		8, 9,			/* octet 9~8, low 16 bits pkt_len */
+		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
+		8, 9,			/* octet 9~8, 16 bits data_len */
+		0xFF, 0xFF,		/* vlan tci set as unknown */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length. */
+	uint16x8_t len_adjust = {
+		0, 0,		/* ignore pkt_type field */
+		head_size,	/* sub head_size on pkt_len */
+		0,		/* ignore high 16 bits of pkt_len */
+		head_size,	/* sub head_size on data_len */
+		0, 0, 0		/* ignore non-length fields */
+	};
+
+	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
+	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
+
+	p_desc = &vq->vq_packed.ring.desc[id];
+	/* Load high 64 bits of packed descriptor 0,1. */
+	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
+	/* Load high 64 bits of packed descriptor 2,3. */
+	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
+
+	/* Only care avail/used bits. */
+	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
+	/* Extract high 32 bits of packed descriptor (id, flags). */
+	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
+				vreinterpretq_u32_u64(desc[1]));
+	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
+
+	uint32x4_t v_used_flag = vdupq_n_u32(0);
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
+
+	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
+
+	/* Check all descs are used. */
+	if (desc_stats)
+		return -1;
+
+	/* Load 2 mbuf pointers per time. */
+	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
+	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
+
+	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
+	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
+
+	/**
+	 *  Update data length and packet length for descriptor.
+	 *  structure of pkt_mb:
+	 *  --------------------------------------------------------------------
+	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
+	 *  --------------------------------------------------------------------
+	 */
+	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+
+	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+
+	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
+	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
+	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
+	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - head_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
-- 
2.28.0


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx NEON path
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
@ 2020-11-17 10:06   ` Joyce Kong
  2021-01-05 14:33     ` Maxime Coquelin
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector " Joyce Kong
  2021-01-08  9:11   ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON Maxime Coquelin
  4 siblings, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2020-11-17 10:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd

Optimize packed ring Tx batch path with NEON instructions.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
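Note: before building the descriptors, the batch path verifies that all four
mbufs are eligible for the fast path. A minimal scalar sketch of that
eligibility test (the struct is a trimmed stand-in for rte_mbuf, so treat
this as illustration only):

#include <stdint.h>
#include <stdbool.h>

/* Trimmed stand-in for the rte_mbuf fields the batch path inspects. */
struct mb {
	uint16_t data_off;	/* headroom in front of packet data */
	uint16_t refcnt;
	uint16_t nb_segs;
};

#define BATCH 4

/* Four mbufs can take the vector Tx path only if each one is
 * single-segment, not shared, and has enough headroom to prepend the
 * virtio net header in front of the payload.
 */
static bool
batch_can_xmit(struct mb *const pkts[BATCH], uint16_t hdr_size)
{
	int i;

	for (i = 0; i < BATCH; i++) {
		if (pkts[i]->refcnt != 1 || pkts[i]->nb_segs != 1)
			return false;
		if (pkts[i]->data_off < hdr_size)
			return false;
	}
	return true;
}

The NEON code folds these checks into two vector compares: refcnt/nb_segs
against 0x10001 and data_off against head_size.
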
 drivers/net/virtio/virtio_rxtx_packed.h      |   6 +-
 drivers/net/virtio/virtio_rxtx_packed_neon.h | 143 +++++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
index 8f5198ad7..016b6fb24 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.h
+++ b/drivers/net/virtio/virtio_rxtx_packed.h
@@ -28,6 +28,8 @@
 /* flag bits offset in packed ring desc from ID */
 #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
 	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
+#define FLAGS_LEN_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
 #endif
 
 #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
@@ -36,13 +38,15 @@
 /* reference count offset in mbuf rearm data */
 #define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
 	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+#ifdef CC_AVX512_SUPPORT
 /* segment number offset in mbuf rearm data */
 #define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
 	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
-
 /* default rearm data */
 #define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
 	1ULL << REFCNT_BITS_OFFSET)
+#endif
 
 /* id bits offset in packed ring desc higher 64bits */
 #define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
index fb1e49909..041f771ea 100644
--- a/drivers/net/virtio/virtio_rxtx_packed_neon.h
+++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
@@ -16,6 +16,149 @@
 #include "virtio_rxtx_packed.h"
 #include "virtqueue.h"
 
+static inline int
+virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
+				   struct rte_mbuf **tx_pkts)
+{
+	struct virtqueue *vq = txvq->vq;
+	uint16_t head_size = vq->hw->vtnet_hdr_size;
+	uint16_t idx = vq->vq_avail_idx;
+	struct virtio_net_hdr *hdr;
+	struct vq_desc_extra *dxp;
+	struct vring_packed_desc *p_desc;
+	uint16_t i;
+
+	if (idx & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* Map four refcnt and nb_segs from mbufs to one NEON register. */
+	uint8x16_t ref_seg_msk = {
+		2, 3, 4, 5,
+		10, 11, 12, 13,
+		18, 19, 20, 21,
+		26, 27, 28, 29
+	};
+
+	/* Map four data_off from mbufs to one NEON register. */
+	uint8x8_t data_msk = {
+		0, 1,
+		8, 9,
+		16, 17,
+		24, 25
+	};
+
+	uint16x8_t net_hdr_msk = {
+		0xFFFF, 0xFFFF,
+		0, 0, 0, 0
+	};
+
+	uint16x4_t pkts[PACKED_BATCH_SIZE];
+	uint8x16x2_t mbuf;
+	/* Load four mbufs rearm data. */
+	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
+	pkts[0] = vld1_u16((uint16_t *)&tx_pkts[0]->rearm_data);
+	pkts[1] = vld1_u16((uint16_t *)&tx_pkts[1]->rearm_data);
+	pkts[2] = vld1_u16((uint16_t *)&tx_pkts[2]->rearm_data);
+	pkts[3] = vld1_u16((uint16_t *)&tx_pkts[3]->rearm_data);
+
+	mbuf.val[0] = vreinterpretq_u8_u16(vcombine_u16(pkts[0], pkts[1]));
+	mbuf.val[1] = vreinterpretq_u8_u16(vcombine_u16(pkts[2], pkts[3]));
+
+	/* refcnt = 1 and nb_segs = 1 */
+	uint32x4_t def_ref_seg = vdupq_n_u32(0x10001);
+	/* Check refcnt and nb_segs. */
+	uint32x4_t ref_seg = vreinterpretq_u32_u8(vqtbl2q_u8(mbuf, ref_seg_msk));
+	poly128_t cmp1 = vreinterpretq_p128_u32(~vceqq_u32(ref_seg, def_ref_seg));
+	if (unlikely(cmp1))
+		return -1;
+
+	/* Check headroom is enough. */
+	uint16x4_t head_rooms = vdup_n_u16(head_size);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
+			 offsetof(struct rte_mbuf, rearm_data));
+	uint16x4_t data_offset = vreinterpret_u16_u8(vqtbl2_u8(mbuf, data_msk));
+	uint64x1_t cmp2 = vreinterpret_u64_u16(vclt_u16(data_offset, head_rooms));
+	if (unlikely(vget_lane_u64(cmp2, 0)))
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		dxp = &vq->vq_descx[idx + i];
+		dxp->ndescs = 1;
+		dxp->cookie = tx_pkts[i];
+	}
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		tx_pkts[i]->data_off -= head_size;
+		tx_pkts[i]->data_len += head_size;
+	}
+
+	uint64x2x2_t desc[PACKED_BATCH_SIZE / 2];
+	uint64x2_t base_addr0 = {
+		VIRTIO_MBUF_ADDR(tx_pkts[0], vq) + tx_pkts[0]->data_off,
+		VIRTIO_MBUF_ADDR(tx_pkts[1], vq) + tx_pkts[1]->data_off
+	};
+	uint64x2_t base_addr1 = {
+		VIRTIO_MBUF_ADDR(tx_pkts[2], vq) + tx_pkts[2]->data_off,
+		VIRTIO_MBUF_ADDR(tx_pkts[3], vq) + tx_pkts[3]->data_off
+	};
+
+	desc[0].val[0] = base_addr0;
+	desc[1].val[0] = base_addr1;
+
+	uint64_t flags = (uint64_t)vq->vq_packed.cached_flags << FLAGS_LEN_BITS_OFFSET;
+	uint64x2_t tx_desc0 = {
+		flags | (uint64_t)idx << ID_BITS_OFFSET | tx_pkts[0]->data_len,
+		flags | (uint64_t)(idx + 1) << ID_BITS_OFFSET | tx_pkts[1]->data_len
+	};
+
+	uint64x2_t tx_desc1 = {
+		flags | (uint64_t)(idx + 2) << ID_BITS_OFFSET | tx_pkts[2]->data_len,
+		flags | (uint64_t)(idx + 3) << ID_BITS_OFFSET | tx_pkts[3]->data_len
+	};
+
+	desc[0].val[1] = tx_desc0;
+	desc[1].val[1] = tx_desc1;
+
+	if (!vq->hw->has_tx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			/* Clear net hdr. */
+			uint16x8_t v_hdr = vld1q_u16((void *)hdr);
+			vst1q_u16((void *)hdr, vandq_u16(v_hdr, net_hdr_msk));
+		}
+	} else {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
+					struct virtio_net_hdr *, -head_size);
+			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
+		}
+	}
+
+	/* Enqueue packet buffers. */
+	p_desc = &vq->vq_packed.ring.desc[idx];
+	vst2q_u64((uint64_t *)p_desc, desc[0]);
+	vst2q_u64((uint64_t *)(p_desc + 2), desc[1]);
+
+	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
+			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
+			tx_pkts[3]->pkt_len);
+
+	vq->vq_avail_idx += PACKED_BATCH_SIZE;
+	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
+
+	if (vq->vq_avail_idx >= vq->vq_nentries) {
+		vq->vq_avail_idx -= vq->vq_nentries;
+		vq->vq_packed.cached_flags ^=
+			VRING_PACKED_DESC_F_AVAIL_USED;
+	}
+
+	return 0;
+}
+
 static inline uint16_t
 virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
 				   struct rte_mbuf **rx_pkts)
-- 
2.28.0


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector NEON path
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
                     ` (2 preceding siblings ...)
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx " Joyce Kong
@ 2020-11-17 10:06   ` Joyce Kong
  2021-01-05 14:42     ` Maxime Coquelin
  2021-01-08  9:11   ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON Maxime Coquelin
  4 siblings, 1 reply; 26+ messages in thread
From: Joyce Kong @ 2020-11-17 10:06 UTC (permalink / raw)
  To: maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd

Add NEON vectorized path selection logic. The default setting comes
from the vectorized devarg, then each criterion below is checked.

The packed ring vectorized NEON path requires:
    NEON supported by both the compiler and the host
    VERSION_1 and IN_ORDER features negotiated
    mergeable feature not negotiated
    LRO Rx offloading disabled

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
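Note: the election criteria above collapse into a single boolean at
configure time. A standalone sketch with plain boolean inputs (the real
code derives them from the vtpci_*/rte_cpu_* helpers in the diff below):

#include <stdbool.h>

/* Inputs normally derived from CPU flags, negotiated features and
 * requested offloads; plain booleans here to keep the sketch
 * self-contained.
 */
struct vec_path_inputs {
	bool vectorized_arg;	/* "vectorized" devarg set by the user */
	bool neon_ok;		/* compiler and host support NEON      */
	bool in_order;		/* VIRTIO_F_IN_ORDER negotiated        */
	bool version_1;		/* VIRTIO_F_VERSION_1 negotiated       */
	bool mergeable;		/* Rx mergeable buffers negotiated     */
	bool lro;		/* TCP_LRO Rx offload requested        */
};

/* Packed ring vectorized Rx is used only when every criterion from the
 * commit message holds (the Tx election does not involve the
 * mergeable/LRO conditions).
 */
static bool
use_packed_vec_rx(const struct vec_path_inputs *in)
{
	return in->vectorized_arg && in->neon_ok && in->in_order &&
	       in->version_1 && !in->mergeable && !in->lro;
}
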
 doc/guides/nics/virtio.rst              |  6 +++---
 drivers/net/virtio/meson.build          |  1 +
 drivers/net/virtio/virtio_ethdev.c      | 19 +++++++++++++++----
 drivers/net/virtio/virtio_rxtx_packed.c |  2 ++
 drivers/net/virtio/virtio_user_ethdev.c |  2 +-
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst
index c03c2d0fe..b7be3aca1 100644
--- a/doc/guides/nics/virtio.rst
+++ b/doc/guides/nics/virtio.rst
@@ -483,11 +483,11 @@ according to below configuration:
 #. Packed virtqueue in-order non-mergeable path: If in-order feature is negotiated and
    Rx mergeable is not negotiated, this path will be selected.
 #. Packed virtqueue vectorized Rx path: If building and running environment support
-   AVX512 && in-order feature is negotiated && Rx mergeable is not negotiated &&
-   TCP_LRO Rx offloading is disabled && vectorized option enabled,
+   (AVX512 || NEON) && in-order feature is negotiated && Rx mergeable
+   is not negotiated && TCP_LRO Rx offloading is disabled && vectorized option enabled,
    this path will be selected.
 #. Packed virtqueue vectorized Tx path: If building and running environment support
-   AVX512 && in-order feature is negotiated && vectorized option enabled,
+   (AVX512 || NEON) && in-order feature is negotiated && vectorized option enabled,
    this path will be selected.
 
 Rx/Tx callbacks of each Virtio path
diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index 01b8de6d4..738d66746 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -32,6 +32,7 @@ if arch_subdir == 'x86'
 elif arch_subdir == 'ppc'
 	sources += files('virtio_rxtx_simple_altivec.c')
 elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64')
+	sources += files('virtio_rxtx_packed.c')
 	sources += files('virtio_rxtx_simple_neon.c')
 endif
 
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 6c233b75b..54a6d6ca9 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -1967,12 +1967,12 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 		if (!vtpci_packed_queue(hw)) {
 			hw->use_vec_rx = 1;
 		} else {
-#if !defined(CC_AVX512_SUPPORT)
-			PMD_DRV_LOG(INFO,
-				"building environment do not support packed ring vectorized");
-#else
+#if defined(CC_AVX512_SUPPORT) || defined(RTE_ARCH_ARM)
 			hw->use_vec_rx = 1;
 			hw->use_vec_tx = 1;
+#else
+			PMD_DRV_LOG(INFO,
+				"building environment do not support packed ring vectorized");
 #endif
 		}
 	}
@@ -2320,6 +2320,17 @@ virtio_dev_configure(struct rte_eth_dev *dev)
 			hw->use_vec_rx = 0;
 			hw->use_vec_tx = 0;
 		}
+#elif defined(RTE_ARCH_ARM)
+		if ((hw->use_vec_rx || hw->use_vec_tx) &&
+		    (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) ||
+		     !vtpci_with_feature(hw, VIRTIO_F_IN_ORDER) ||
+		     !vtpci_with_feature(hw, VIRTIO_F_VERSION_1) ||
+		     rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128)) {
+			PMD_DRV_LOG(INFO,
+				"disabled packed ring vectorized path for requirements not met");
+			hw->use_vec_rx = 0;
+			hw->use_vec_tx = 0;
+		}
 #else
 		hw->use_vec_rx = 0;
 		hw->use_vec_tx = 0;
diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c
index 99d9a5a99..882dca36e 100644
--- a/drivers/net/virtio/virtio_rxtx_packed.c
+++ b/drivers/net/virtio/virtio_rxtx_packed.c
@@ -18,6 +18,8 @@
 
 #ifdef CC_AVX512_SUPPORT
 #include "virtio_rxtx_packed_avx.h"
+#elif defined(RTE_ARCH_ARM)
+#include "virtio_rxtx_packed_neon.h"
 #endif
 
 uint16_t
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 40345193e..241808cd8 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -856,7 +856,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
 
 	if (vectorized) {
 		if (packed_vq) {
-#if defined(CC_AVX512_SUPPORT)
+#if defined(CC_AVX512_SUPPORT) || defined(RTE_ARCH_ARM)
 			hw->use_vec_rx = 1;
 			hw->use_vec_tx = 1;
 #else
-- 
2.28.0


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
@ 2021-01-05 14:06     ` Maxime Coquelin
  0 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-05 14:06 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 11/17/20 11:06 AM, Joyce Kong wrote:
> Split out AVX instruction based virtio packed ring Rx and Tx
> implementation to a separate file.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  drivers/net/virtio/meson.build              |   4 +-
>  drivers/net/virtio/virtio_rxtx.c            |   6 +-
>  drivers/net/virtio/virtio_rxtx_packed.c     | 137 +++++
>  drivers/net/virtio/virtio_rxtx_packed.h     | 298 ++++++++++
>  drivers/net/virtio/virtio_rxtx_packed_avx.c | 626 --------------------
>  drivers/net/virtio/virtio_rxtx_packed_avx.h | 239 ++++++++
>  6 files changed, 678 insertions(+), 632 deletions(-)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
>  delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
@ 2021-01-05 14:16     ` Maxime Coquelin
  2021-01-05 14:27       ` Maxime Coquelin
  2021-01-08 17:02     ` Ferruh Yigit
  1 sibling, 1 reply; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-05 14:16 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 11/17/20 11:06 AM, Joyce Kong wrote:
> Optimize packed ring Rx batch path with NEON instructions.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>  2 files changed, 165 insertions(+)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
> 
> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
> index b0b1d63ec..8f5198ad7 100644
> --- a/drivers/net/virtio/virtio_rxtx_packed.h
> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
> @@ -19,9 +19,16 @@
>  #include "virtqueue.h"
>  
>  #define BYTE_SIZE 8
> +
> +#ifdef CC_AVX512_SUPPORT
>  /* flag bits offset in packed ring desc higher 64bits */
>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +#elif defined(RTE_ARCH_ARM)
> +/* flag bits offset in packed ring desc from ID */
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
> +#endif
>  
>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>  	FLAGS_BITS_OFFSET)
> @@ -44,8 +51,16 @@
>  /* net hdr short size mask */
>  #define NET_HDR_MASK 0x3F
>  
> +#ifdef RTE_ARCH_ARM
> +/* The cache line size on different Arm platforms are different, so
> + * put a four batch size here to match with the minimum cache line
> + * size and accommodate NEON register size.
> + */
> +#define PACKED_BATCH_SIZE 4
> +#else
>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>  	sizeof(struct vring_packed_desc))
> +#endif
>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>  
>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> new file mode 100644
> index 000000000..fb1e49909
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> @@ -0,0 +1,150 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Arm Corporation
> + */
> +
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_net.h>
> +#include <rte_vect.h>
> +
> +#include "virtio_ethdev.h"
> +#include "virtio_pci.h"
> +#include "virtio_rxtx_packed.h"
> +#include "virtqueue.h"
> +
> +static inline uint16_t
> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
> +				   struct rte_mbuf **rx_pkts)
> +{
> +	struct virtqueue *vq = rxvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t head_size = hw->vtnet_hdr_size;
> +	uint16_t id = vq->vq_used_cons_idx;
> +	struct vring_packed_desc *p_desc;
> +	uint16_t i;
> +
> +	if (id & PACKED_BATCH_MASK)
> +		return -1;
> +
> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
> +		return -1;

This function returns an unsigned short, I think you should return 0
here since it failed to dequeue packets.

> +	/* Map packed descriptor to mbuf fields. */
> +	uint8x16_t shuf_msk1 = {
> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
> +		0, 1,			/* octet 1~0, 16 bits data_len */
> +		0xFF, 0xFF,		/* vlan tci set as unknown */
> +		0xFF, 0xFF, 0xFF, 0xFF
> +	};
> +
> +	uint8x16_t shuf_msk2 = {
> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
> +		8, 9,			/* octet 9~8, 16 bits data_len */
> +		0xFF, 0xFF,		/* vlan tci set as unknown */
> +		0xFF, 0xFF, 0xFF, 0xFF
> +	};
> +
> +	/* Subtract the header length. */
> +	uint16x8_t len_adjust = {
> +		0, 0,		/* ignore pkt_type field */
> +		head_size,	/* sub head_size on pkt_len */
> +		0,		/* ignore high 16 bits of pkt_len */
> +		head_size,	/* sub head_size on data_len */
> +		0, 0, 0		/* ignore non-length fields */
> +	};
> +
> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
> +
> +	p_desc = &vq->vq_packed.ring.desc[id];
> +	/* Load high 64 bits of packed descriptor 0,1. */
> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
> +	/* Load high 64 bits of packed descriptor 2,3. */
> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
> +
> +	/* Only care avail/used bits. */
> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
> +	/* Extract high 32 bits of packed descriptor (id, flags). */
> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
> +				vreinterpretq_u32_u64(desc[1]));
> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
> +
> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
> +	if (vq->vq_packed.used_wrap_counter)
> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
> +
> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
> +
> +	/* Check all descs are used. */
> +	if (desc_stats)
> +		return -1;

Same here. You should return 0 here as the queue is full.

> +
> +	/* Load 2 mbuf pointers per time. */
> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
> +
> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
> +
> +	/**
> +	 *  Update data length and packet length for descriptor.
> +	 *  structure of pkt_mb:
> +	 *  --------------------------------------------------------------------
> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
> +	 *  --------------------------------------------------------------------
> +	 */
> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
> +
> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
> +
> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
> +
> +	if (hw->has_rx_offload) {
> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			char *addr = (char *)rx_pkts[i]->buf_addr +
> +				RTE_PKTMBUF_HEADROOM - head_size;
> +			virtio_vec_rx_offload(rx_pkts[i],
> +					(struct virtio_net_hdr *)addr);
> +		}
> +	}
> +
> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
> +			rx_pkts[3]->pkt_len);
> +
> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
> +
> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> +		vq->vq_packed.used_wrap_counter ^= 1;
> +	}
> +
> +	return 0;
> +}
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-05 14:16     ` Maxime Coquelin
@ 2021-01-05 14:27       ` Maxime Coquelin
  2021-01-07 10:39         ` Maxime Coquelin
  0 siblings, 1 reply; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-05 14:27 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 1/5/21 3:16 PM, Maxime Coquelin wrote:
> 
> 
> On 11/17/20 11:06 AM, Joyce Kong wrote:
>> Optimize packed ring Rx batch path with NEON instructions.
>>
>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>>  2 files changed, 165 insertions(+)
>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>
>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
>> index b0b1d63ec..8f5198ad7 100644
>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>> @@ -19,9 +19,16 @@
>>  #include "virtqueue.h"
>>  
>>  #define BYTE_SIZE 8
>> +
>> +#ifdef CC_AVX512_SUPPORT
>>  /* flag bits offset in packed ring desc higher 64bits */
>>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>> +#elif defined(RTE_ARCH_ARM)
>> +/* flag bits offset in packed ring desc from ID */
>> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
>> +#endif
>>  
>>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>>  	FLAGS_BITS_OFFSET)
>> @@ -44,8 +51,16 @@
>>  /* net hdr short size mask */
>>  #define NET_HDR_MASK 0x3F
>>  
>> +#ifdef RTE_ARCH_ARM
>> +/* The cache line size on different Arm platforms are different, so
>> + * put a four batch size here to match with the minimum cache line
>> + * size and accommodate NEON register size.
>> + */
>> +#define PACKED_BATCH_SIZE 4
>> +#else
>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>  	sizeof(struct vring_packed_desc))
>> +#endif
>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>  
>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>> new file mode 100644
>> index 000000000..fb1e49909
>> --- /dev/null
>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>> @@ -0,0 +1,150 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2020 Arm Corporation
>> + */
>> +
>> +#include <stdlib.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <string.h>
>> +#include <errno.h>
>> +
>> +#include <rte_net.h>
>> +#include <rte_vect.h>
>> +
>> +#include "virtio_ethdev.h"
>> +#include "virtio_pci.h"
>> +#include "virtio_rxtx_packed.h"
>> +#include "virtqueue.h"
>> +
>> +static inline uint16_t
>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>> +				   struct rte_mbuf **rx_pkts)
>> +{
>> +	struct virtqueue *vq = rxvq->vq;
>> +	struct virtio_hw *hw = vq->hw;
>> +	uint16_t head_size = hw->vtnet_hdr_size;
>> +	uint16_t id = vq->vq_used_cons_idx;
>> +	struct vring_packed_desc *p_desc;
>> +	uint16_t i;
>> +
>> +	if (id & PACKED_BATCH_MASK)
>> +		return -1;
>> +
>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>> +		return -1;
> 
> This function returns an unsigned short, I think you should return 0
> here since it failed to dequeue packets.
> 
>> +	/* Map packed descriptor to mbuf fields. */
>> +	uint8x16_t shuf_msk1 = {
>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>> +		0xFF, 0xFF, 0xFF, 0xFF
>> +	};
>> +
>> +	uint8x16_t shuf_msk2 = {
>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>> +		0xFF, 0xFF, 0xFF, 0xFF
>> +	};
>> +
>> +	/* Subtract the header length. */
>> +	uint16x8_t len_adjust = {
>> +		0, 0,		/* ignore pkt_type field */
>> +		head_size,	/* sub head_size on pkt_len */
>> +		0,		/* ignore high 16 bits of pkt_len */
>> +		head_size,	/* sub head_size on data_len */
>> +		0, 0, 0		/* ignore non-length fields */
>> +	};
>> +
>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>> +
>> +	p_desc = &vq->vq_packed.ring.desc[id];
>> +	/* Load high 64 bits of packed descriptor 0,1. */
>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>> +	/* Load high 64 bits of packed descriptor 2,3. */
>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>> +
>> +	/* Only care avail/used bits. */
>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>> +				vreinterpretq_u32_u64(desc[1]));
>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>> +
>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>> +	if (vq->vq_packed.used_wrap_counter)
>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>> +
>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
>> +
>> +	/* Check all descs are used. */
>> +	if (desc_stats)
>> +		return -1;
> 
> Same here. You should return 0 here as the queue is full.

Just looked again at the code and at AVX implementation.
It should not return 0 here, but any positive value.

Maybe the cleanest way would be to change the function prototype to int.
0: success
-1: failure

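A minimal sketch of that suggestion (illustrative only, not part of the
submitted patch) would just touch the prototype and keep the existing
-1/0 returns:

	-static inline uint16_t
	+static inline int
	 virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
	 				   struct rte_mbuf **rx_pkts)

so callers, which presumably just test the return value for non-zero,
would be unaffected.
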
>> +
>> +	/* Load 2 mbuf pointers per time. */
>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>> +
>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>> +
>> +	/**
>> +	 *  Update data length and packet length for descriptor.
>> +	 *  structure of pkt_mb:
>> +	 *  --------------------------------------------------------------------
>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>> +	 *  --------------------------------------------------------------------
>> +	 */
>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>> +
>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>> +
>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>> +
>> +	if (hw->has_rx_offload) {
>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>> +				RTE_PKTMBUF_HEADROOM - head_size;
>> +			virtio_vec_rx_offload(rx_pkts[i],
>> +					(struct virtio_net_hdr *)addr);
>> +		}
>> +	}
>> +
>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>> +			rx_pkts[3]->pkt_len);
>> +
>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>> +
>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>> +		vq->vq_packed.used_wrap_counter ^= 1;
>> +	}
>> +
>> +	return 0;
>> +}
>>
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx NEON path
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx " Joyce Kong
@ 2021-01-05 14:33     ` Maxime Coquelin
  0 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-05 14:33 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 11/17/20 11:06 AM, Joyce Kong wrote:
> Optimize packed ring Tx batch path with NEON instructions.
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  drivers/net/virtio/virtio_rxtx_packed.h      |   6 +-
>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 143 +++++++++++++++++++
>  2 files changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
> index 8f5198ad7..016b6fb24 100644
> --- a/drivers/net/virtio/virtio_rxtx_packed.h
> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
> @@ -28,6 +28,8 @@
>  /* flag bits offset in packed ring desc from ID */
>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>  	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
> +#define FLAGS_LEN_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>  #endif
>  
>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
> @@ -36,13 +38,15 @@
>  /* reference count offset in mbuf rearm data */
>  #define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
>  	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> +
> +#ifdef CC_AVX512_SUPPORT
>  /* segment number offset in mbuf rearm data */
>  #define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
>  	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> -
>  /* default rearm data */
>  #define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
>  	1ULL << REFCNT_BITS_OFFSET)
> +#endif
>  
>  /* id bits offset in packed ring desc higher 64bits */
>  #define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \
> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> index fb1e49909..041f771ea 100644
> --- a/drivers/net/virtio/virtio_rxtx_packed_neon.h
> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
> @@ -16,6 +16,149 @@
>  #include "virtio_rxtx_packed.h"
>  #include "virtqueue.h"
>  
> +static inline int
> +virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq,
> +				   struct rte_mbuf **tx_pkts)
> +{
> +	struct virtqueue *vq = txvq->vq;
> +	uint16_t head_size = vq->hw->vtnet_hdr_size;
> +	uint16_t idx = vq->vq_avail_idx;
> +	struct virtio_net_hdr *hdr;
> +	struct vq_desc_extra *dxp;
> +	struct vring_packed_desc *p_desc;
> +	uint16_t i;
> +
> +	if (idx & PACKED_BATCH_MASK)
> +		return -1;
> +
> +	if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
> +		return -1;
> +
> +	/* Map four refcnt and nb_segs from mbufs to one NEON register. */
> +	uint8x16_t ref_seg_msk = {
> +		2, 3, 4, 5,
> +		10, 11, 12, 13,
> +		18, 19, 20, 21,
> +		26, 27, 28, 29
> +	};
> +
> +	/* Map four data_off from mbufs to one NEON register. */
> +	uint8x8_t data_msk = {
> +		0, 1,
> +		8, 9,
> +		16, 17,
> +		24, 25
> +	};
> +
> +	uint16x8_t net_hdr_msk = {
> +		0xFFFF, 0xFFFF,
> +		0, 0, 0, 0
> +	};
> +
> +	uint16x4_t pkts[PACKED_BATCH_SIZE];
> +	uint8x16x2_t mbuf;
> +	/* Load four mbufs rearm data. */
> +	RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64);
> +	pkts[0] = vld1_u16((uint16_t *)&tx_pkts[0]->rearm_data);
> +	pkts[1] = vld1_u16((uint16_t *)&tx_pkts[1]->rearm_data);
> +	pkts[2] = vld1_u16((uint16_t *)&tx_pkts[2]->rearm_data);
> +	pkts[3] = vld1_u16((uint16_t *)&tx_pkts[3]->rearm_data);
> +
> +	mbuf.val[0] = vreinterpretq_u8_u16(vcombine_u16(pkts[0], pkts[1]));
> +	mbuf.val[1] = vreinterpretq_u8_u16(vcombine_u16(pkts[2], pkts[3]));
> +
> +	/* refcnt = 1 and nb_segs = 1 */
> +	uint32x4_t def_ref_seg = vdupq_n_u32(0x10001);
> +	/* Check refcnt and nb_segs. */
> +	uint32x4_t ref_seg = vreinterpretq_u32_u8(vqtbl2q_u8(mbuf, ref_seg_msk));
> +	poly128_t cmp1 = vreinterpretq_p128_u32(~vceqq_u32(ref_seg, def_ref_seg));
> +	if (unlikely(cmp1))
> +		return -1;
> +
> +	/* Check headroom is enough. */
> +	uint16x4_t head_rooms = vdup_n_u16(head_size);
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) !=
> +			 offsetof(struct rte_mbuf, rearm_data));
> +	uint16x4_t data_offset = vreinterpret_u16_u8(vqtbl2_u8(mbuf, data_msk));
> +	uint64x1_t cmp2 = vreinterpret_u64_u16(vclt_u16(data_offset, head_rooms));
> +	if (unlikely(vget_lane_u64(cmp2, 0)))
> +		return -1;
> +
> +	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		dxp = &vq->vq_descx[idx + i];
> +		dxp->ndescs = 1;
> +		dxp->cookie = tx_pkts[i];
> +	}
> +
> +	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		tx_pkts[i]->data_off -= head_size;
> +		tx_pkts[i]->data_len += head_size;
> +	}
> +
> +	uint64x2x2_t desc[PACKED_BATCH_SIZE / 2];
> +	uint64x2_t base_addr0 = {
> +		VIRTIO_MBUF_ADDR(tx_pkts[0], vq) + tx_pkts[0]->data_off,
> +		VIRTIO_MBUF_ADDR(tx_pkts[1], vq) + tx_pkts[1]->data_off
> +	};
> +	uint64x2_t base_addr1 = {
> +		VIRTIO_MBUF_ADDR(tx_pkts[2], vq) + tx_pkts[2]->data_off,
> +		VIRTIO_MBUF_ADDR(tx_pkts[3], vq) + tx_pkts[3]->data_off
> +	};
> +
> +	desc[0].val[0] = base_addr0;
> +	desc[1].val[0] = base_addr1;
> +
> +	uint64_t flags = (uint64_t)vq->vq_packed.cached_flags << FLAGS_LEN_BITS_OFFSET;
> +	uint64x2_t tx_desc0 = {
> +		flags | (uint64_t)idx << ID_BITS_OFFSET | tx_pkts[0]->data_len,
> +		flags | (uint64_t)(idx + 1) << ID_BITS_OFFSET | tx_pkts[1]->data_len
> +	};
> +
> +	uint64x2_t tx_desc1 = {
> +		flags | (uint64_t)(idx + 2) << ID_BITS_OFFSET | tx_pkts[2]->data_len,
> +		flags | (uint64_t)(idx + 3) << ID_BITS_OFFSET | tx_pkts[3]->data_len
> +	};
> +
> +	desc[0].val[1] = tx_desc0;
> +	desc[1].val[1] = tx_desc1;
> +
> +	if (!vq->hw->has_tx_offload) {
> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
> +					struct virtio_net_hdr *, -head_size);
> +			/* Clear net hdr. */
> +			uint16x8_t v_hdr = vld1q_u16((void *)hdr);
> +			vst1q_u16((void *)hdr, vandq_u16(v_hdr, net_hdr_msk));
> +		}
> +	} else {
> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			hdr = rte_pktmbuf_mtod_offset(tx_pkts[i],
> +					struct virtio_net_hdr *, -head_size);
> +			virtqueue_xmit_offload(hdr, tx_pkts[i], true);
> +		}
> +	}
> +
> +	/* Enqueue packet buffers. */
> +	p_desc = &vq->vq_packed.ring.desc[idx];
> +	vst2q_u64((uint64_t *)p_desc, desc[0]);
> +	vst2q_u64((uint64_t *)(p_desc + 2), desc[1]);
> +
> +	virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len,
> +			tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len,
> +			tx_pkts[3]->pkt_len);
> +
> +	vq->vq_avail_idx += PACKED_BATCH_SIZE;
> +	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
> +
> +	if (vq->vq_avail_idx >= vq->vq_nentries) {
> +		vq->vq_avail_idx -= vq->vq_nentries;
> +		vq->vq_packed.cached_flags ^=
> +			VRING_PACKED_DESC_F_AVAIL_USED;
> +	}
> +
> +	return 0;
> +}
> +
>  static inline uint16_t
>  virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>  				   struct rte_mbuf **rx_pkts)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector NEON path
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector " Joyce Kong
@ 2021-01-05 14:42     ` Maxime Coquelin
  0 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-05 14:42 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 11/17/20 11:06 AM, Joyce Kong wrote:
> Add NEON vectorized path selection logic. Default setting comes from
> vectorized devarg, then checks each criteria.
> 
> Packed ring vectorized neon path need:
>     NEON is supported by compiler and host
>     VERSION_1 and IN_ORDER features are negotiated
>     mergeable feature is not negotiated
>     LRO offloading is disabled
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  doc/guides/nics/virtio.rst              |  6 +++---
>  drivers/net/virtio/meson.build          |  1 +
>  drivers/net/virtio/virtio_ethdev.c      | 19 +++++++++++++++----
>  drivers/net/virtio/virtio_rxtx_packed.c |  2 ++
>  drivers/net/virtio/virtio_user_ethdev.c |  2 +-
>  5 files changed, 22 insertions(+), 8 deletions(-)
> 

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-05 14:27       ` Maxime Coquelin
@ 2021-01-07 10:39         ` Maxime Coquelin
  2021-01-08  7:29           ` Joyce Kong
  0 siblings, 1 reply; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-07 10:39 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 1/5/21 3:27 PM, Maxime Coquelin wrote:
> 
> 
> On 1/5/21 3:16 PM, Maxime Coquelin wrote:
>>
>>
>> On 11/17/20 11:06 AM, Joyce Kong wrote:
>>> Optimize packed ring Rx batch path with NEON instructions.
>>>
>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>> ---
>>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++
>>>  2 files changed, 165 insertions(+)
>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>
>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h
>>> index b0b1d63ec..8f5198ad7 100644
>>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>>> @@ -19,9 +19,16 @@
>>>  #include "virtqueue.h"
>>>  
>>>  #define BYTE_SIZE 8
>>> +
>>> +#ifdef CC_AVX512_SUPPORT
>>>  /* flag bits offset in packed ring desc higher 64bits */
>>>  #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>>> +#elif defined(RTE_ARCH_ARM)
>>> +/* flag bits offset in packed ring desc from ID */
>>> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE)
>>> +#endif
>>>  
>>>  #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
>>>  	FLAGS_BITS_OFFSET)
>>> @@ -44,8 +51,16 @@
>>>  /* net hdr short size mask */
>>>  #define NET_HDR_MASK 0x3F
>>>  
>>> +#ifdef RTE_ARCH_ARM
>>> +/* The cache line size on different Arm platforms are different, so
>>> + * put a four batch size here to match with the minimum cache line
>>> + * size and accommodate NEON register size.
>>> + */
>>> +#define PACKED_BATCH_SIZE 4
>>> +#else
>>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>>  	sizeof(struct vring_packed_desc))
>>> +#endif
>>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>>  
>>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>> new file mode 100644
>>> index 000000000..fb1e49909
>>> --- /dev/null
>>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>> @@ -0,0 +1,150 @@
>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>> + * Copyright(c) 2020 Arm Corporation
>>> + */
>>> +
>>> +#include <stdlib.h>
>>> +#include <stdint.h>
>>> +#include <stdio.h>
>>> +#include <string.h>
>>> +#include <errno.h>
>>> +
>>> +#include <rte_net.h>
>>> +#include <rte_vect.h>
>>> +
>>> +#include "virtio_ethdev.h"
>>> +#include "virtio_pci.h"
>>> +#include "virtio_rxtx_packed.h"
>>> +#include "virtqueue.h"
>>> +
>>> +static inline uint16_t
>>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>>> +				   struct rte_mbuf **rx_pkts)
>>> +{
>>> +	struct virtqueue *vq = rxvq->vq;
>>> +	struct virtio_hw *hw = vq->hw;
>>> +	uint16_t head_size = hw->vtnet_hdr_size;
>>> +	uint16_t id = vq->vq_used_cons_idx;
>>> +	struct vring_packed_desc *p_desc;
>>> +	uint16_t i;
>>> +
>>> +	if (id & PACKED_BATCH_MASK)
>>> +		return -1;
>>> +
>>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>>> +		return -1;
>>
>> This function returns an unsigned short, I think you should return 0
>> here since it failed to dequeue packets.
>>
>>> +	/* Map packed descriptor to mbuf fields. */
>>> +	uint8x16_t shuf_msk1 = {
>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>> +	};
>>> +
>>> +	uint8x16_t shuf_msk2 = {
>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out */
>>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>> +	};
>>> +
>>> +	/* Subtract the header length. */
>>> +	uint16x8_t len_adjust = {
>>> +		0, 0,		/* ignore pkt_type field */
>>> +		head_size,	/* sub head_size on pkt_len */
>>> +		0,		/* ignore high 16 bits of pkt_len */
>>> +		head_size,	/* sub head_size on data_len */
>>> +		0, 0, 0		/* ignore non-length fields */
>>> +	};
>>> +
>>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>>> +
>>> +	p_desc = &vq->vq_packed.ring.desc[id];
>>> +	/* Load high 64 bits of packed descriptor 0,1. */
>>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>>> +	/* Load high 64 bits of packed descriptor 2,3. */
>>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>>> +
>>> +	/* Only care avail/used bits. */
>>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>>> +				vreinterpretq_u32_u64(desc[1]));
>>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>>> +
>>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>>> +	if (vq->vq_packed.used_wrap_counter)
>>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>>> +
>>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag));
>>> +
>>> +	/* Check all descs are used. */
>>> +	if (desc_stats)
>>> +		return -1;
>>
>> Same here. You should return 0 here as the queue is full.
> 
> Just looked again at the code and at AVX implementation.
> It should not return 0 here, but any positive value.
> 
> Maybe the cleanest way would be to change the function prototype to int.
> 0: success
> -1: failure


Joyce, are you fine if I do the change while applying?
I have a big series that will conflict with your patch set, so I'd like
to have yours merged ASAP so I can start the rebase.

Thanks,
Maxime

>>> +
>>> +	/* Load 2 mbuf pointers per time. */
>>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>>> +
>>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>>> +
>>> +	/**
>>> +	 *  Update data length and packet length for descriptor.
>>> +	 *  structure of pkt_mb:
>>> +	 *  --------------------------------------------------------------------
>>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>>> +	 *  --------------------------------------------------------------------
>>> +	 */
>>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>>> +
>>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>>> +
>>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>>> +
>>> +	if (hw->has_rx_offload) {
>>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>>> +				RTE_PKTMBUF_HEADROOM - head_size;
>>> +			virtio_vec_rx_offload(rx_pkts[i],
>>> +					(struct virtio_net_hdr *)addr);
>>> +		}
>>> +	}
>>> +
>>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>>> +			rx_pkts[3]->pkt_len);
>>> +
>>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>>> +
>>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>>> +		vq->vq_packed.used_wrap_counter ^= 1;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>>
>>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-07 10:39         ` Maxime Coquelin
@ 2021-01-08  7:29           ` Joyce Kong
  0 siblings, 0 replies; 26+ messages in thread
From: Joyce Kong @ 2021-01-08  7:29 UTC (permalink / raw)
  To: Maxime Coquelin, chenbo.xia, jerinj, Ruifeng Wang, Honnappa Nagarahalli
  Cc: dev, nd

>On 1/5/21 3:27 PM, Maxime Coquelin wrote:
>>
>>
>> On 1/5/21 3:16 PM, Maxime Coquelin wrote:
>>>
>>>
>>> On 11/17/20 11:06 AM, Joyce Kong wrote:
>>>> Optimize packed ring Rx batch path with NEON instructions.
>>>>
>>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>>>> ---
>>>>  drivers/net/virtio/virtio_rxtx_packed.h      |  15 ++
>>>>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 150
>>>> +++++++++++++++++++
>>>>  2 files changed, 165 insertions(+)
>>>>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>>
>>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed.h
>>>> b/drivers/net/virtio/virtio_rxtx_packed.h
>>>> index b0b1d63ec..8f5198ad7 100644
>>>> --- a/drivers/net/virtio/virtio_rxtx_packed.h
>>>> +++ b/drivers/net/virtio/virtio_rxtx_packed.h
>>>> @@ -19,9 +19,16 @@
>>>>  #include "virtqueue.h"
>>>>
>>>>  #define BYTE_SIZE 8
>>>> +
>>>> +#ifdef CC_AVX512_SUPPORT
>>>>  /* flag bits offset in packed ring desc higher 64bits */  #define
>>>> FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>>  	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
>>>> +#elif defined(RTE_ARCH_ARM)
>>>> +/* flag bits offset in packed ring desc from ID */ #define
>>>> +FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
>>>> +	offsetof(struct vring_packed_desc, id)) * BYTE_SIZE) #endif
>>>>
>>>>  #define PACKED_FLAGS_MASK ((0ULL |
>VRING_PACKED_DESC_F_AVAIL_USED) << \
>>>>  	FLAGS_BITS_OFFSET)
>>>> @@ -44,8 +51,16 @@
>>>>  /* net hdr short size mask */
>>>>  #define NET_HDR_MASK 0x3F
>>>>
>>>> +#ifdef RTE_ARCH_ARM
>>>> +/* The cache line size on different Arm platforms are different, so
>>>> + * put a four batch size here to match with the minimum cache line
>>>> + * size and accommodate NEON register size.
>>>> + */
>>>> +#define PACKED_BATCH_SIZE 4
>>>> +#else
>>>>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>>>>  	sizeof(struct vring_packed_desc))
>>>> +#endif
>>>>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
>>>>
>>>>  #ifdef VIRTIO_GCC_UNROLL_PRAGMA
>>>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> new file mode 100644
>>>> index 000000000..fb1e49909
>>>> --- /dev/null
>>>> +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h
>>>> @@ -0,0 +1,150 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2020 Arm Corporation  */
>>>> +
>>>> +#include <stdlib.h>
>>>> +#include <stdint.h>
>>>> +#include <stdio.h>
>>>> +#include <string.h>
>>>> +#include <errno.h>
>>>> +
>>>> +#include <rte_net.h>
>>>> +#include <rte_vect.h>
>>>> +
>>>> +#include "virtio_ethdev.h"
>>>> +#include "virtio_pci.h"
>>>> +#include "virtio_rxtx_packed.h"
>>>> +#include "virtqueue.h"
>>>> +
>>>> +static inline uint16_t
>>>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
>>>> +				   struct rte_mbuf **rx_pkts)
>>>> +{
>>>> +	struct virtqueue *vq = rxvq->vq;
>>>> +	struct virtio_hw *hw = vq->hw;
>>>> +	uint16_t head_size = hw->vtnet_hdr_size;
>>>> +	uint16_t id = vq->vq_used_cons_idx;
>>>> +	struct vring_packed_desc *p_desc;
>>>> +	uint16_t i;
>>>> +
>>>> +	if (id & PACKED_BATCH_MASK)
>>>> +		return -1;
>>>> +
>>>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
>>>> +		return -1;
>>>
>>> This function returns an unsigned short, I think you should return 0
>>> here since it failed to dequeue packets.
>>>
>>>> +	/* Map packed descriptor to mbuf fields. */
>>>> +	uint8x16_t shuf_msk1 = {
>>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>>> +		0, 1,			/* octet 1~0, low 16 bits pkt_len */
>>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out
>*/
>>>> +		0, 1,			/* octet 1~0, 16 bits data_len */
>>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>>> +	};
>>>> +
>>>> +	uint8x16_t shuf_msk2 = {
>>>> +		0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */
>>>> +		8, 9,			/* octet 9~8, low 16 bits pkt_len */
>>>> +		0xFF, 0xFF,		/* skip high 16 bits of pkt_len, zero out
>*/
>>>> +		8, 9,			/* octet 9~8, 16 bits data_len */
>>>> +		0xFF, 0xFF,		/* vlan tci set as unknown */
>>>> +		0xFF, 0xFF, 0xFF, 0xFF
>>>> +	};
>>>> +
>>>> +	/* Subtract the header length. */
>>>> +	uint16x8_t len_adjust = {
>>>> +		0, 0,		/* ignore pkt_type field */
>>>> +		head_size,	/* sub head_size on pkt_len */
>>>> +		0,		/* ignore high 16 bits of pkt_len */
>>>> +		head_size,	/* sub head_size on data_len */
>>>> +		0, 0, 0		/* ignore non-length fields */
>>>> +	};
>>>> +
>>>> +	uint64x2_t desc[PACKED_BATCH_SIZE / 2];
>>>> +	uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2];
>>>> +	uint64x2_t pkt_mb[PACKED_BATCH_SIZE];
>>>> +
>>>> +	p_desc = &vq->vq_packed.ring.desc[id];
>>>> +	/* Load high 64 bits of packed descriptor 0,1. */
>>>> +	desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1];
>>>> +	/* Load high 64 bits of packed descriptor 2,3. */
>>>> +	desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1];
>>>> +
>>>> +	/* Only care avail/used bits. */
>>>> +	uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK);
>>>> +	/* Extract high 32 bits of packed descriptor (id, flags). */
>>>> +	uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]),
>>>> +				vreinterpretq_u32_u64(desc[1]));
>>>> +	uint32x4_t v_flag = vandq_u32(v_desc, v_mask);
>>>> +
>>>> +	uint32x4_t v_used_flag = vdupq_n_u32(0);
>>>> +	if (vq->vq_packed.used_wrap_counter)
>>>> +		v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK);
>>>> +
>>>> +	poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag,
>>>> +v_used_flag));
>>>> +
>>>> +	/* Check all descs are used. */
>>>> +	if (desc_stats)
>>>> +		return -1;
>>>
>>> Same here. You should return 0 here as the queue is full.
>>
>> Just looked again at the code and at AVX implementation.
>> It should not return 0 here, but any positive value.
>>
>> Maybe the cleanest way would be to change the function prototype to int.
>> 0: success
>> -1: failure
>
>
>Joyce, are you fine if I do the change while applying?
>I have a big series that will conflict with your patch set, so I'd like to have
>yours merged ASAP so I can start the rebase.
>
>Thanks,
>Maxime
>

Maxime, it's ok if you do the change while applying.

Thanks,
Joyce
 
>>>> +
>>>> +	/* Load 2 mbuf pointers per time. */
>>>> +	mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]);
>>>> +	vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]);
>>>> +
>>>> +	mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]);
>>>> +	vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]);
>>>> +
>>>> +	/**
>>>> +	 *  Update data length and packet length for descriptor.
>>>> +	 *  structure of pkt_mb:
>>>> +	 *  --------------------------------------------------------------------
>>>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>>>> +	 *  --------------------------------------------------------------------
>>>> +	 */
>>>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>>>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>>>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>>>> +	pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
>>>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk2));
>>>> +
>>>> +	pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
>>>> +	pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
>>>> +	pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
>>>> +	pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
>>>> +			vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
>>>> +
>>>> +	vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]);
>>>> +	vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]);
>>>> +	vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]);
>>>> +	vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]);
>>>> +
>>>> +	if (hw->has_rx_offload) {
>>>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
>>>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
>>>> +				RTE_PKTMBUF_HEADROOM - head_size;
>>>> +			virtio_vec_rx_offload(rx_pkts[i],
>>>> +					(struct virtio_net_hdr *)addr);
>>>> +		}
>>>> +	}
>>>> +
>>>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
>>>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
>>>> +			rx_pkts[3]->pkt_len);
>>>> +
>>>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
>>>> +
>>>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
>>>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
>>>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
>>>> +		vq->vq_packed.used_wrap_counter ^= 1;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>>
>>>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON
  2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
                     ` (3 preceding siblings ...)
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector " Joyce Kong
@ 2021-01-08  9:11   ` Maxime Coquelin
  4 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-08  9:11 UTC (permalink / raw)
  To: Joyce Kong, chenbo.xia, jerinj, ruifeng.wang, honnappa.nagarahalli
  Cc: dev, nd



On 11/17/20 11:06 AM, Joyce Kong wrote:
> This patch set introduces vectorized RX/TX path for packed ring with NEON
> intrinsics.
> 
> With this patch set, PVP case has 1.5% perf uplift for the packed vectorized
> NEON path compared with the non-vector packed path, under 0.001% acceptable
> loss with 2 cores on vhost side and 1 core on virtio side.
> 
> Joyce Kong (4):
>   net/virtio: move AVX based Rx and Tx code to separate file
>   net/virtio: add vectorized packed ring Rx NEON path
>   net/virtio: add vectorized packed ring Tx NEON path
>   net/virtio: add election for packed vector NEON path
> 
>  doc/guides/nics/virtio.rst                   |   6 +-
>  drivers/net/virtio/meson.build               |   5 +-
>  drivers/net/virtio/virtio_ethdev.c           |  19 +-
>  drivers/net/virtio/virtio_rxtx.c             |   6 +-
>  drivers/net/virtio/virtio_rxtx_packed.c      | 139 ++++
>  drivers/net/virtio/virtio_rxtx_packed.h      | 317 ++++++++++
>  drivers/net/virtio/virtio_rxtx_packed_avx.c  | 626 -------------------
>  drivers/net/virtio/virtio_rxtx_packed_avx.h  | 239 +++++++
>  drivers/net/virtio/virtio_rxtx_packed_neon.h | 293 +++++++++
>  drivers/net/virtio/virtio_user_ethdev.c      |   2 +-
>  10 files changed, 1012 insertions(+), 640 deletions(-)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h
>  delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h
>  create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h
> 

Series applied to dpdk-next-virtio/main.

Thanks,
Maxime


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
  2021-01-05 14:16     ` Maxime Coquelin
@ 2021-01-08 17:02     ` Ferruh Yigit
  2021-01-08 22:26       ` Honnappa Nagarahalli
                         ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Ferruh Yigit @ 2021-01-08 17:02 UTC (permalink / raw)
  To: Joyce Kong, maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang,
	honnappa.nagarahalli
  Cc: dev, nd, David Marchand, Thomas Monjalon, dpdklab, Aaron Conole,
	Chen, Zhaoyan

On 11/17/2020 10:06 AM, Joyce Kong wrote:
> +	/**
> +	 *  Update data length and packet length for descriptor.
> +	 *  structure of pkt_mb:
> +	 *  --------------------------------------------------------------------
> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
> +	 *  --------------------------------------------------------------------
> +	 */
> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'

s\'\;

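For reference, the corrected statement would simply end with a semicolon
instead of the stray quote:

	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
			vreinterpretq_u8_u64(desc[1]), shuf_msk1));
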
I will fix in next-net but my concern is why this has not been caught by any of 
our automated builds?

In patchwork, the only test report seems to be from 'checkpatch':
https://patches.dpdk.org/patch/84260/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-08 17:02     ` Ferruh Yigit
@ 2021-01-08 22:26       ` Honnappa Nagarahalli
  2021-01-11 13:05         ` Aaron Conole
  2021-01-11 10:45       ` Maxime Coquelin
  2021-01-11 13:04       ` Aaron Conole
  2 siblings, 1 reply; 26+ messages in thread
From: Honnappa Nagarahalli @ 2021-01-08 22:26 UTC (permalink / raw)
  To: Ferruh Yigit, Joyce Kong, maxime.coquelin, chenbo.xia, jerinj,
	Ruifeng Wang
  Cc: dev, nd, David Marchand, thomas, dpdklab, Aaron Conole, Chen,
	Zhaoyan, Honnappa Nagarahalli, nd

<snip>

> 
> On 11/17/2020 10:06 AM, Joyce Kong wrote:
> > +	/**
> > +	 *  Update data length and packet length for descriptor.
> > +	 *  structure of pkt_mb:
> > +	 *  --------------------------------------------------------------------
> > +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
> vlan_tci|
> > +	 *  --------------------------------------------------------------------
> > +	 */
> > +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
> > +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
> > +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
> > +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> 
> s\'\;
> 
> I will fix in next-net but my concern is why this has not been caught by any of
> our automated builds?
> 
> In patchwork, the only test report seems to be from 'checkpatch':
> https://patches.dpdk.org/patch/84260/
 
Looking at [1], Travis CI had not run and the UNH CI did not have Arm builds enabled at the time this patch was submitted.

[1] https://patches.dpdk.org/patch/84262/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-08 17:02     ` Ferruh Yigit
  2021-01-08 22:26       ` Honnappa Nagarahalli
@ 2021-01-11 10:45       ` Maxime Coquelin
  2021-01-11 13:04       ` Aaron Conole
  2 siblings, 0 replies; 26+ messages in thread
From: Maxime Coquelin @ 2021-01-11 10:45 UTC (permalink / raw)
  To: Ferruh Yigit, Joyce Kong, chenbo.xia, jerinj, ruifeng.wang,
	honnappa.nagarahalli
  Cc: dev, nd, David Marchand, Thomas Monjalon, dpdklab, Aaron Conole,
	Chen, Zhaoyan



On 1/8/21 6:02 PM, Ferruh Yigit wrote:
> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> +    /**
>> +     *  Update data length and packet length for descriptor.
>> +     *  structure of pkt_mb:
>> +     * 
>> --------------------------------------------------------------------
>> +     *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
>> vlan_tci|
>> +     * 
>> --------------------------------------------------------------------
>> +     */
>> +    pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +    pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +    pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +            vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
> 
> s\'\;
> 
> I will fix in next-net but my concern is why this has not been caught by
> any of our automated builds?
> 
> In patchwork, the only test report seems to be from 'checkpatch':
> https://patches.dpdk.org/patch/84260/
> 

Thanks Ferruh for spotting and fixing it.

I think the CI was broken at the time it was submitted; it would be
great to have a way to manually trigger the CI again!

That, plus me changing laptops recently and not having a full multi-arch
build system up and running again, made this build issue slip through...

Regards,
Maxime


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-08 17:02     ` Ferruh Yigit
  2021-01-08 22:26       ` Honnappa Nagarahalli
  2021-01-11 10:45       ` Maxime Coquelin
@ 2021-01-11 13:04       ` Aaron Conole
  2 siblings, 0 replies; 26+ messages in thread
From: Aaron Conole @ 2021-01-11 13:04 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Joyce Kong, maxime.coquelin, chenbo.xia, jerinj, ruifeng.wang,
	honnappa.nagarahalli, dev, nd, David Marchand, Thomas Monjalon,
	dpdklab, Chen, Zhaoyan

Ferruh Yigit <ferruh.yigit@intel.com> writes:

> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> +	/**
>> +	 *  Update data length and packet length for descriptor.
>> +	 *  structure of pkt_mb:
>> +	 *  --------------------------------------------------------------------
>> +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci|
>> +	 *  --------------------------------------------------------------------
>> +	 */
>> +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>
> s\'\;
>
> I will fix in next-net but my concern is why this has not been caught
> by any of our automated builds?

That series was flagged as failing by Travis:

http://mails.dpdk.org/archives/test-report/2020-November/167602.html

Unfortunately, the build seems to have been purged (since it's from
November).  But Travis did flag the build as failing.  With github
actions we hope to pull the full logs into the email.

> In patchwork, the only test report seems to be from 'checkpatch':
> https://patches.dpdk.org/patch/84260/

At least the 0-day robot does not submit each patch for a separate build.
We did that at first, and the robot's queue reached a week of backlog
because the build takes a while, especially when we get 20+ patch
series followed by v2-v4 fixing build errors or compile errors.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path
  2021-01-08 22:26       ` Honnappa Nagarahalli
@ 2021-01-11 13:05         ` Aaron Conole
  0 siblings, 0 replies; 26+ messages in thread
From: Aaron Conole @ 2021-01-11 13:05 UTC (permalink / raw)
  To: Honnappa Nagarahalli
  Cc: Ferruh Yigit, Joyce Kong, maxime.coquelin, chenbo.xia, jerinj,
	Ruifeng Wang, dev, nd, David Marchand, thomas, dpdklab, Chen,
	Zhaoyan

Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> writes:

> <snip>
>
>> 
>> On 11/17/2020 10:06 AM, Joyce Kong wrote:
>> > +	/**
>> > +	 *  Update data length and packet length for descriptor.
>> > +	 *  structure of pkt_mb:
>> > +	 *  --------------------------------------------------------------------
>> > +	 *  |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits
>> vlan_tci|
>> > +	 *  --------------------------------------------------------------------
>> > +	 */
>> > +	pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk1));
>> > +	pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[0]), shuf_msk2));
>> > +	pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
>> > +			vreinterpretq_u8_u64(desc[1]), shuf_msk1))'
>> 
>> s\'\;
>> 
>> I will fix in next-net but my concern is why this has not been caught by any of
>> our automated builds?
>> 
>> In patchwork, the only test report seems to be from 'checkpatch':
>> https://patches.dpdk.org/patch/84260/
>  
> Looking at [1], Travis CI has not run and the UNH CI did not have Arm
> builds enabled at the time this patch was submitted.

See my other email.  Travis CI doesn't run 'patch-at-a-time' for
execution time reasons.

> [1] https://patches.dpdk.org/patch/84262/


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2021-01-11 13:05 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-11 12:09 [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 1/3] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 2/3] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
2020-09-11 12:09 ` [dpdk-dev] [RFC 3/3] net/virtio: add election for packed vector " Joyce Kong
2020-10-05  7:34 ` [dpdk-dev] [RFC 0/3] Vectorize packed ring RX path with NEON Maxime Coquelin
2020-10-08  6:54   ` Joyce Kong
2020-10-15  9:01   ` Ruifeng Wang
2020-10-15  9:02     ` Maxime Coquelin
2020-11-17 10:06 ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX " Joyce Kong
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file Joyce Kong
2021-01-05 14:06     ` Maxime Coquelin
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path Joyce Kong
2021-01-05 14:16     ` Maxime Coquelin
2021-01-05 14:27       ` Maxime Coquelin
2021-01-07 10:39         ` Maxime Coquelin
2021-01-08  7:29           ` Joyce Kong
2021-01-08 17:02     ` Ferruh Yigit
2021-01-08 22:26       ` Honnappa Nagarahalli
2021-01-11 13:05         ` Aaron Conole
2021-01-11 10:45       ` Maxime Coquelin
2021-01-11 13:04       ` Aaron Conole
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx " Joyce Kong
2021-01-05 14:33     ` Maxime Coquelin
2020-11-17 10:06   ` [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector " Joyce Kong
2021-01-05 14:42     ` Maxime Coquelin
2021-01-08  9:11   ` [dpdk-dev] [PATCH v1 0/4] Vectorize packed ring RX/TX path with NEON Maxime Coquelin
