From: Wencheng Li <liwencheng@phytium.com.cn>
To: stephen@networkplumber.org
Cc: dev@dpdk.org
Subject: [PATCH v3 2/6] net/macb: add NEON vectorized Rx/Tx
Date: Tue, 10 Dec 2024 07:05:56 +0000 [thread overview]
Message-ID: <1733814356-412059-1-git-send-email-liwencheng@phytium.com.cn> (raw)
In-Reply-To: <1731310965-1743397-1-git-send-email-liwencheng@phytium.com.cn>
To optimize Rx/Tx burst process,
add NEON vector instructions on arm architecture.
Signed-off-by: Wencheng Li <liwencheng@phytium.com.cn>
---
drivers/net/macb/macb_rxtx.c | 2 +
drivers/net/macb/macb_rxtx_vec_neon.c | 672 ++++++++++++++++++++++++++++++++++
drivers/net/macb/meson.build | 4 +
3 files changed, 678 insertions(+)
create mode 100644 drivers/net/macb/macb_rxtx_vec_neon.c
diff --git a/drivers/net/macb/macb_rxtx.c b/drivers/net/macb/macb_rxtx.c
index 7104ec5..fa36a1e 100644
--- a/drivers/net/macb/macb_rxtx.c
+++ b/drivers/net/macb/macb_rxtx.c
@@ -1354,6 +1354,7 @@ int __rte_cold eth_macb_rx_init(struct rte_eth_dev *dev)
return 0;
}
+#if !defined(RTE_ARCH_ARM64)
uint16_t
eth_macb_recv_pkts_vec(void __rte_unused *rx_queue,
struct rte_mbuf __rte_unused **rx_pkts,
@@ -1377,3 +1378,4 @@ eth_macb_xmit_pkts_vec(void __rte_unused *tx_queue,
{
return 0;
}
+#endif
diff --git a/drivers/net/macb/macb_rxtx_vec_neon.c b/drivers/net/macb/macb_rxtx_vec_neon.c
new file mode 100644
index 0000000..7f02a57
--- /dev/null
+++ b/drivers/net/macb/macb_rxtx_vec_neon.c
@@ -0,0 +1,672 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Phytium Technology Co., Ltd.
+ */
+
+#include <rte_bus_vdev.h>
+#include <ethdev_driver.h>
+#include <rte_kvargs.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+#include <stdint.h>
+
+#include <fcntl.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <net/if.h>
+#include <rte_ether.h>
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include "macb_rxtx.h"
+
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
+#define MACB_UINT8_BIT (CHAR_BIT * sizeof(uint8_t))
+
+#define MACB_DESC_EOF_MASK 0x80808080
+
+static inline uint32_t macb_get_packet_type(struct rte_mbuf *rxm)
+{
+ struct rte_ether_hdr *eth_hdr;
+ uint16_t ether_type;
+
+ eth_hdr = rte_pktmbuf_mtod(rxm, struct rte_ether_hdr *);
+ ether_type = eth_hdr->ether_type;
+
+ if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
+ return RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4;
+ else if (ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV6))
+ return RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6;
+ else
+ return RTE_PTYPE_UNKNOWN;
+}
+
+static inline uint8x8_t macb_mbuf_initializer(struct macb_rx_queue *rxq)
+{
+ struct rte_mbuf mbuf = {.buf_addr = 0}; /* zeroed mbuf */
+ uint64x1_t mbuf_initializer = vdup_n_u64(0);
+ uint8x8_t rearm_data_vec;
+
+ mbuf.data_off = RTE_PKTMBUF_HEADROOM + MACB_RX_DATA_OFFSET;
+ mbuf.nb_segs = 1;
+ mbuf.port = rxq->port_id;
+ rte_mbuf_refcnt_set(&mbuf, 1);
+
+ /* prevent compiler reordering: rearm_data covers previous fields */
+ rte_compiler_barrier();
+ mbuf_initializer =
+ vset_lane_u64(*(uint64_t *)(&mbuf.rearm_data), mbuf_initializer, 0);
+ rearm_data_vec = vld1_u8((uint8_t *)&mbuf_initializer);
+ return rearm_data_vec;
+}
+
+static inline void macb_rxq_rearm(struct macb_rx_queue *rxq)
+{
+ uint64_t dma_addr;
+ struct macb_dma_desc *desc;
+ unsigned int entry;
+ struct rte_mbuf *nmb;
+ struct macb *bp;
+ register int i = 0;
+ struct macb_rx_entry *rxe;
+
+ uint32x2_t zero = vdup_n_u32(0);
+ uint8x8_t rearm_data_vec;
+
+ bp = rxq->bp;
+ rxe = &rxq->rx_sw_ring[rxq->rxrearm_start];
+
+ entry = macb_rx_ring_wrap(bp, rxq->rxrearm_start);
+ desc = macb_rx_desc(rxq, entry);
+
+ rearm_data_vec = macb_mbuf_initializer(rxq);
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (unlikely(rte_mempool_get_bulk(rxq->mb_pool, (void *)rxe,
+ MACB_RXQ_REARM_THRESH) < 0)) {
+ if (rxq->rxrearm_nb + (unsigned int)MACB_RXQ_REARM_THRESH >=
+ rxq->nb_rx_desc) {
+ MACB_LOG(ERR, "allocate mbuf fail!\n");
+ for (i = 0; i < MACB_DESCS_PER_LOOP; i++) {
+ rxe[i].mbuf = &rxq->fake_mbuf;
+ vst1_u32((uint32_t *)&desc[MACB_DESC_ADDR_INTERVAL * i], zero);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+ MACB_RXQ_REARM_THRESH;
+ return;
+ }
+
+ for (i = 0; i < MACB_RXQ_REARM_THRESH; ++i) {
+ nmb = rxe[i].mbuf;
+ entry = macb_rx_ring_wrap(bp, rxq->rxrearm_start);
+ desc = macb_rx_desc(rxq, entry);
+ rxq->rxrearm_start++;
+ vst1_u8((uint8_t *)&nmb->rearm_data, rearm_data_vec);
+ dma_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
+ if (unlikely(entry == rxq->nb_rx_desc - 1))
+ dma_addr |= MACB_BIT(RX_WRAP);
+ desc->ctrl = 0;
+ /* Setting addr clears RX_USED and allows reception,
+ * make sure ctrl is cleared first to avoid a race.
+ */
+ rte_wmb();
+ macb_set_addr(bp, desc, dma_addr);
+ }
+ if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc))
+ rxq->rxrearm_start = 0;
+ rxq->rxrearm_nb -= MACB_RXQ_REARM_THRESH;
+}
+
+static inline void macb_pkts_to_ptype_v(struct rte_mbuf **rx_pkts)
+{
+ if (likely(rx_pkts[0]->buf_addr != NULL))
+ rx_pkts[0]->packet_type = macb_get_packet_type(rx_pkts[0]);
+
+ if (likely(rx_pkts[1]->buf_addr != NULL))
+ rx_pkts[1]->packet_type = macb_get_packet_type(rx_pkts[1]);
+
+ if (likely(rx_pkts[2]->buf_addr != NULL))
+ rx_pkts[2]->packet_type = macb_get_packet_type(rx_pkts[2]);
+
+ if (likely(rx_pkts[3]->buf_addr != NULL))
+ rx_pkts[3]->packet_type = macb_get_packet_type(rx_pkts[3]);
+}
+
+static inline void macb_pkts_to_port_v(struct rte_mbuf **rx_pkts, uint16_t port_id)
+{
+ rx_pkts[0]->port = port_id;
+ rx_pkts[1]->port = port_id;
+ rx_pkts[2]->port = port_id;
+ rx_pkts[3]->port = port_id;
+}
+
+static inline void macb_free_rx_pkts(struct macb_rx_queue *rxq,
+ struct rte_mbuf **rx_pkts, int pos, uint16_t count)
+{
+ for (int j = 0; j < count; j++) {
+ if (likely(rx_pkts[pos + j] != NULL)) {
+ rte_pktmbuf_free_seg(rx_pkts[pos + j]);
+ rx_pkts[pos + j] = NULL;
+ }
+ }
+ rxq->rx_tail += count;
+ rxq->rxrearm_nb += count;
+ rxq->stats.rx_dropped += count;
+}
+
+static uint16_t macb_recv_raw_pkts_vec(struct macb_rx_queue *rxq,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts, uint8_t *split_packet)
+{
+ struct macb_dma_desc *desc;
+ struct macb_rx_entry *rx_sw_ring;
+ struct macb_rx_entry *rxn;
+ uint16_t nb_pkts_recv = 0;
+ register uint16_t pos;
+ uint16_t bytes_len = 0;
+
+ uint8x16_t shuf_msk = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 0xFF, 0xFF,
+ 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ };
+ uint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0, rxq->crc_len, 0, 0, 0};
+
+ /* nb_pkts shall be less equal than MACB_MAX_RX_BURST */
+ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, MACB_DESCS_PER_LOOP);
+ nb_pkts = RTE_MIN(nb_pkts, MACB_MAX_RX_BURST);
+
+ desc = rxq->rx_ring + rxq->rx_tail * MACB_DESC_ADDR_INTERVAL;
+ rte_prefetch_non_temporal(desc);
+
+ if (rxq->rxrearm_nb >= MACB_RXQ_REARM_THRESH)
+ macb_rxq_rearm(rxq);
+
+ /* Make hw descriptor updates visible to CPU */
+ rte_rmb();
+
+ /* Before we start moving massive data around, check to see if
+ * there is actually a packet available
+ */
+ if (!((desc->addr & MACB_BIT(RX_USED)) ? true : false))
+ return 0;
+
+ rx_sw_ring = &rxq->rx_sw_ring[rxq->rx_tail];
+ /* A. load 4 packet in one loop
+ * B. copy 4 mbuf point from swring to rx_pkts
+ * C. calc the number of RX_USED bits among the 4 packets
+ * D. fill info. from desc to mbuf
+ */
+ for (pos = 0, nb_pkts_recv = 0; pos < nb_pkts; pos += MACB_DESCS_PER_LOOP,
+ desc += MACB_DESCS_PER_LOOP * MACB_DESC_ADDR_INTERVAL) {
+ uint64x2_t mbp1, mbp2;
+ uint64x2_t descs[MACB_DESCS_PER_LOOP];
+ uint8x16x2_t sterr_tmp1, sterr_tmp2;
+ uint8x16_t staterr;
+ uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+ uint16x8_t pkt_mb_mask;
+ uint16x8_t tmp;
+ uint16_t cur_bytes_len[MACB_DESCS_PER_LOOP] = {0, 0, 0, 0};
+ uint32_t stat;
+ uint16_t nb_used = 0;
+ uint16_t i;
+
+ /* B.1 load 2 mbuf point */
+ mbp1 = vld1q_u64((uint64_t *)&rx_sw_ring[pos]);
+ /* B.2 copy 2 mbuf point into rx_pkts */
+ vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
+
+ /* B.1 load 2 mbuf point */
+ mbp2 = vld1q_u64((uint64_t *)&rx_sw_ring[pos + 2]);
+ /* B.2 copy 2 mbuf point into rx_pkts */
+ vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);
+
+ rte_mbuf_prefetch_part2(rx_pkts[pos]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+ rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+
+ /* A. load 4 pkts descs */
+ descs[0] = vld1q_u64((uint64_t *)(desc));
+ descs[1] = vld1q_u64((uint64_t *)(desc + 1 * MACB_DESC_ADDR_INTERVAL));
+ descs[2] = vld1q_u64((uint64_t *)(desc + 2 * MACB_DESC_ADDR_INTERVAL));
+ descs[3] = vld1q_u64((uint64_t *)(desc + 3 * MACB_DESC_ADDR_INTERVAL));
+
+ rxn = &rx_sw_ring[pos + 0 + MACB_NEON_PREFETCH_ENTRY];
+ rte_prefetch0((char *)rxn->mbuf->buf_addr + rxn->mbuf->data_off);
+ rxn = &rx_sw_ring[pos + 1 + MACB_NEON_PREFETCH_ENTRY];
+ rte_prefetch0((char *)rxn->mbuf->buf_addr + rxn->mbuf->data_off);
+ rxn = &rx_sw_ring[pos + 2 + MACB_NEON_PREFETCH_ENTRY];
+ rte_prefetch0((char *)rxn->mbuf->buf_addr + rxn->mbuf->data_off);
+ rxn = &rx_sw_ring[pos + 3 + MACB_NEON_PREFETCH_ENTRY];
+ rte_prefetch0((char *)rxn->mbuf->buf_addr + rxn->mbuf->data_off);
+
+ /* D.1 pkt convert format from desc to pktmbuf */
+ pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);
+ pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
+ pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
+ pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
+
+ /* D.2 pkt 1,2 set length and remove crc */
+ if (split_packet)
+ pkt_mb_mask = vdupq_n_u16(MACB_RX_JFRMLEN_MASK);
+ else
+ pkt_mb_mask = vdupq_n_u16(MACB_RX_FRMLEN_MASK);
+
+ tmp = vsubq_u16(vandq_u16(vreinterpretq_u16_u8(pkt_mb1), pkt_mb_mask), crc_adjust);
+ pkt_mb1 = vreinterpretq_u8_u16(tmp);
+ cur_bytes_len[0] = vgetq_lane_u16(tmp, 2);
+
+ tmp = vsubq_u16(vandq_u16(vreinterpretq_u16_u8(pkt_mb2), pkt_mb_mask), crc_adjust);
+ pkt_mb2 = vreinterpretq_u8_u16(tmp);
+ cur_bytes_len[1] = vgetq_lane_u16(tmp, 2);
+
+ vst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1);
+ vst1q_u8((uint8_t *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb2);
+
+ /* D.2 pkt 3,4 length and remove crc */
+ tmp = vsubq_u16(vandq_u16(vreinterpretq_u16_u8(pkt_mb3), pkt_mb_mask), crc_adjust);
+ pkt_mb3 = vreinterpretq_u8_u16(tmp);
+ cur_bytes_len[2] = vgetq_lane_u16(tmp, 2);
+
+ tmp = vsubq_u16(vandq_u16(vreinterpretq_u16_u8(pkt_mb4), pkt_mb_mask), crc_adjust);
+ pkt_mb4 = vreinterpretq_u8_u16(tmp);
+ cur_bytes_len[3] = vgetq_lane_u16(tmp, 2);
+
+ vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb3);
+ vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb4);
+
+ /*C.1 filter RX_USED or SOF_EOF info only */
+ sterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),
+ vreinterpretq_u8_u64(descs[2]));
+ sterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),
+ vreinterpretq_u8_u64(descs[3]));
+
+ /* C* extract and record EOF bit */
+ if (split_packet) {
+ uint8x16_t eof;
+
+ eof = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[1];
+ stat = vgetq_lane_u32(vreinterpretq_u32_u8(eof), 1);
+ /* and with mask to extract bits, flipping 1-0 */
+ *(int *)split_packet = ~stat & MACB_DESC_EOF_MASK;
+
+ split_packet += MACB_DESCS_PER_LOOP;
+ }
+
+ /* C.2 get 4 pkts RX_USED value */
+ staterr = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];
+
+ /* C.3 expand RX_USED bit to saturate UINT8 */
+ staterr = vshlq_n_u8(staterr, MACB_UINT8_BIT - 1);
+ staterr = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(staterr),
+ MACB_UINT8_BIT - 1));
+ stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);
+
+ rte_prefetch_non_temporal(desc + MACB_DESCS_PER_LOOP *
+ MACB_DESC_ADDR_INTERVAL);
+
+ /* C.4 calc available number of desc */
+ if (unlikely(stat == 0))
+ nb_used = MACB_DESCS_PER_LOOP;
+ else
+ nb_used = rte_ctz32(stat) / MACB_UINT8_BIT;
+
+ macb_pkts_to_ptype_v(&rx_pkts[pos]);
+ macb_pkts_to_port_v(&rx_pkts[pos], rxq->port_id);
+
+ if (nb_used == MACB_DESCS_PER_LOOP) {
+ if (split_packet == NULL) {
+ uint8x16_t sof_eof;
+
+ sof_eof = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[1];
+ sof_eof = vreinterpretq_u8_s8
+ (vshrq_n_s8(vreinterpretq_s8_u8(sof_eof),
+ MACB_UINT8_BIT - 2));
+
+ /*get 4 pkts SOF_EOF value*/
+ stat = ~vgetq_lane_u32(vreinterpretq_u32_u8(sof_eof), 1);
+ if (unlikely(stat != 0)) {
+ MACB_LOG(ERR, "not whole frame pointed by descriptor\n");
+ macb_free_rx_pkts(rxq, rx_pkts, pos, MACB_DESCS_PER_LOOP);
+ goto out;
+ }
+ }
+ } else {
+ u32 ctrl;
+
+ if (split_packet == NULL) {
+ for (i = 0; i < nb_used; i++, desc += MACB_DESC_ADDR_INTERVAL) {
+ ctrl = desc->ctrl;
+ if (unlikely((ctrl & (MACB_BIT(RX_SOF) | MACB_BIT(RX_EOF)))
+ != (MACB_BIT(RX_SOF) | MACB_BIT(RX_EOF)))) {
+ MACB_LOG(ERR, "not whole frame pointed by descriptor\n");
+ macb_free_rx_pkts(rxq, rx_pkts, pos, nb_used);
+ goto out;
+ }
+ }
+ }
+ }
+
+ nb_pkts_recv += nb_used;
+ for (i = 0; i < nb_used; i++)
+ bytes_len += (cur_bytes_len[i] + rxq->crc_len);
+
+ if (nb_used < MACB_DESCS_PER_LOOP)
+ break;
+ }
+
+out:
+ rxq->stats.rx_bytes += (unsigned long)bytes_len;
+ rxq->stats.rx_packets += nb_pkts_recv;
+ /* Update our internal tail pointer */
+ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recv);
+ rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+ rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recv);
+ /* Make descriptor updates visible to hardware */
+ rte_wmb();
+
+ return nb_pkts_recv;
+}
+
+uint16_t eth_macb_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ return macb_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+static inline uint16_t macb_reassemble_packets(struct macb_rx_queue *rxq,
+ struct rte_mbuf **rx_bufs,
+ uint16_t nb_bufs,
+ uint8_t *split_flags)
+{
+ struct rte_mbuf *pkts[nb_bufs]; /*finished pkts*/
+ struct rte_mbuf *start = rxq->pkt_first_seg;
+ struct rte_mbuf *end = rxq->pkt_last_seg;
+ unsigned int pkt_idx, buf_idx;
+ struct rte_mbuf *curr = rxq->pkt_last_seg;
+ uint16_t data_bus_width_mask;
+
+ data_bus_width_mask = MACB_DATA_BUS_WIDTH_MASK(rxq->bp->data_bus_width);
+ for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
+ uint16_t len = 0;
+
+ if (end != NULL) {
+ /* processing a split packet */
+ end = rx_bufs[buf_idx];
+ curr->next = end;
+ len = end->data_len + rxq->crc_len;
+ end->data_len =
+ len ? (len - start->pkt_len) : rxq->bp->rx_buffer_size;
+ end->data_off = RTE_PKTMBUF_HEADROOM & ~data_bus_width_mask;
+
+ start->nb_segs++;
+ rxq->stats.rx_packets--;
+ start->pkt_len += end->data_len;
+
+ if (!split_flags[buf_idx]) {
+ end->next = NULL;
+ /* we need to strip crc for the whole packet */
+ if (unlikely(rxq->crc_len > 0)) {
+ start->pkt_len -= RTE_ETHER_CRC_LEN;
+ if (end->data_len > RTE_ETHER_CRC_LEN) {
+ end->data_len -= RTE_ETHER_CRC_LEN;
+ } else {
+ start->nb_segs--;
+ curr->data_len -= RTE_ETHER_CRC_LEN - end->data_len;
+ curr->next = NULL;
+ /* free up last mbuf */
+ rte_pktmbuf_free_seg(end);
+ }
+ }
+ pkts[pkt_idx++] = start;
+ start = NULL;
+ end = NULL;
+ } else {
+ curr = curr->next;
+ }
+ } else {
+ /* not processing a split packet */
+ if (!split_flags[buf_idx]) {
+ /* not a split packet, save and skip */
+ pkts[pkt_idx++] = rx_bufs[buf_idx];
+ continue;
+ }
+ start = rx_bufs[buf_idx];
+ start->pkt_len = rxq->bp->rx_buffer_size - MACB_RX_DATA_OFFSET
+ - (RTE_PKTMBUF_HEADROOM & data_bus_width_mask);
+ start->data_len = start->pkt_len;
+ start->port = rxq->port_id;
+ curr = start;
+ end = start;
+ }
+ }
+
+ /* save the partial packet for next time */
+ rxq->pkt_first_seg = start;
+ rxq->pkt_last_seg = end;
+ rte_memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
+ return pkt_idx;
+}
+
+static uint16_t eth_macb_recv_scattered_burst_vec(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ struct macb_rx_queue *rxq = rx_queue;
+ uint8_t split_flags[MACB_MAX_RX_BURST] = {0};
+ uint16_t nb_bufs;
+ const uint64_t *split_fl64;
+ uint16_t i;
+ uint16_t reassemble_packets;
+
+ /* get some new buffers */
+ nb_bufs = macb_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts, split_flags);
+ if (nb_bufs == 0)
+ return 0;
+
+ /* happy day case, full burst + no packets to be joined */
+ split_fl64 = (uint64_t *)split_flags;
+ if (rxq->pkt_first_seg == NULL && split_fl64[0] == 0 &&
+ split_fl64[1] == 0 && split_fl64[2] == 0 && split_fl64[3] == 0)
+ return nb_bufs;
+
+ /* reassemble any packets that need reassembly*/
+ i = 0;
+ if (rxq->pkt_first_seg == NULL) {
+ /* find the first split flag, and only reassemble then*/
+ while (i < nb_bufs && !split_flags[i])
+ i++;
+ if (i == nb_bufs)
+ return nb_bufs;
+ }
+
+ reassemble_packets = macb_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
+ &split_flags[i]);
+ return i + reassemble_packets;
+}
+
+uint16_t eth_macb_recv_scattered_pkts_vec(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t retval = 0;
+
+ while (nb_pkts > MACB_MAX_RX_BURST) {
+ uint16_t burst;
+
+ burst = eth_macb_recv_scattered_burst_vec(rx_queue, rx_pkts + retval,
+ MACB_MAX_RX_BURST);
+ retval += burst;
+ nb_pkts -= burst;
+ if (burst < MACB_MAX_RX_BURST)
+ return retval;
+ }
+
+ return retval + eth_macb_recv_scattered_burst_vec(rx_queue,
+ rx_pkts + retval, nb_pkts);
+}
+
+static inline void macb_set_txdesc(struct macb_tx_queue *queue,
+ struct macb_dma_desc *txdesc,
+ struct rte_mbuf **tx_pkts, unsigned int pos)
+{
+ uint32x4_t ctrl_v = vdupq_n_u32(0);
+ uint32x4_t data_len_v = vdupq_n_u32(0);
+ uint32x4_t BIT_TX_USED = vdupq_n_u32(MACB_BIT(TX_USED));
+ uint32x4_t BIT_TX_LAST = vdupq_n_u32(MACB_BIT(TX_LAST));
+ uint32x4_t BIT_TX_WARP = vdupq_n_u32(0);
+ uint32x4_t BIT_TX_UNUSED = vdupq_n_u32(~MACB_BIT(TX_USED));
+ uint64_t buf_dma_addr;
+
+ data_len_v =
+ vsetq_lane_u32((uint32_t)(tx_pkts[0]->data_len), data_len_v, 0);
+ data_len_v =
+ vsetq_lane_u32((uint32_t)(tx_pkts[1]->data_len), data_len_v, 1);
+ data_len_v =
+ vsetq_lane_u32((uint32_t)(tx_pkts[2]->data_len), data_len_v, 2);
+ data_len_v =
+ vsetq_lane_u32((uint32_t)(tx_pkts[3]->data_len), data_len_v, 3);
+
+ ctrl_v = vorrq_u32(vorrq_u32(data_len_v, BIT_TX_USED), BIT_TX_LAST);
+
+ if (unlikely(pos + MACB_DESCS_PER_LOOP == queue->nb_tx_desc)) {
+ BIT_TX_WARP = vsetq_lane_u32(MACB_BIT(TX_WRAP), BIT_TX_WARP, 3);
+ ctrl_v = vorrq_u32(ctrl_v, BIT_TX_WARP);
+ }
+
+ buf_dma_addr = rte_mbuf_data_iova(tx_pkts[0]);
+ macb_set_addr(queue->bp, txdesc, buf_dma_addr);
+ buf_dma_addr = rte_mbuf_data_iova(tx_pkts[1]);
+ macb_set_addr(queue->bp, txdesc + 1 * MACB_DESC_ADDR_INTERVAL,
+ buf_dma_addr);
+ buf_dma_addr = rte_mbuf_data_iova(tx_pkts[2]);
+ macb_set_addr(queue->bp, txdesc + 2 * MACB_DESC_ADDR_INTERVAL,
+ buf_dma_addr);
+ buf_dma_addr = rte_mbuf_data_iova(tx_pkts[3]);
+ macb_set_addr(queue->bp, txdesc + 3 * MACB_DESC_ADDR_INTERVAL,
+ buf_dma_addr);
+
+ ctrl_v = vandq_u32(ctrl_v, BIT_TX_UNUSED);
+ rte_wmb();
+
+ txdesc->ctrl = vgetq_lane_u32(ctrl_v, 0);
+ (txdesc + 1 * MACB_DESC_ADDR_INTERVAL)->ctrl = vgetq_lane_u32(ctrl_v, 1);
+ (txdesc + 2 * MACB_DESC_ADDR_INTERVAL)->ctrl = vgetq_lane_u32(ctrl_v, 2);
+ (txdesc + 3 * MACB_DESC_ADDR_INTERVAL)->ctrl = vgetq_lane_u32(ctrl_v, 3);
+}
+
+static inline uint16_t
+macb_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+ struct macb_tx_queue *queue;
+ struct macb_tx_entry *txe;
+ struct macb_dma_desc *txdesc;
+ struct macb *bp;
+ uint32_t tx_tail;
+ uint16_t nb_xmit_vec;
+ uint16_t nb_tx;
+ uint16_t nb_txok;
+ uint16_t nb_idx;
+ uint64x2_t mbp1, mbp2;
+ uint16x4_t nb_segs_v = vdup_n_u16(0);
+
+ queue = (struct macb_tx_queue *)tx_queue;
+ bp = queue->bp;
+ nb_tx = 0;
+
+ nb_xmit_vec = nb_pkts - nb_pkts % MACB_DESCS_PER_LOOP;
+ tx_tail = queue->tx_tail;
+ txe = &queue->tx_sw_ring[tx_tail];
+ txdesc = queue->tx_ring + tx_tail * MACB_DESC_ADDR_INTERVAL;
+
+ for (nb_idx = 0; nb_idx < nb_xmit_vec; tx_tail += MACB_DESCS_PER_LOOP,
+ nb_idx += MACB_DESCS_PER_LOOP,
+ txdesc += MACB_DESCS_PER_LOOP * MACB_DESC_ADDR_INTERVAL) {
+ nb_segs_v = vset_lane_u16(tx_pkts[nb_tx]->nb_segs, nb_segs_v, 0);
+ nb_segs_v = vset_lane_u16(tx_pkts[nb_tx + 1]->nb_segs, nb_segs_v, 1);
+ nb_segs_v = vset_lane_u16(tx_pkts[nb_tx + 2]->nb_segs, nb_segs_v, 2);
+ nb_segs_v = vset_lane_u16(tx_pkts[nb_tx + 3]->nb_segs, nb_segs_v, 3);
+ if (vmaxv_u16(nb_segs_v) > 1) {
+ queue->tx_tail = macb_tx_ring_wrap(bp, tx_tail);
+ nb_txok = eth_macb_xmit_pkts(queue, &tx_pkts[nb_tx], nb_pkts);
+ nb_tx += nb_txok;
+ goto out;
+ }
+
+ if (likely(txe[nb_tx].mbuf != NULL))
+ rte_pktmbuf_free_seg(txe[nb_tx].mbuf);
+ if (likely(txe[nb_tx + 1].mbuf != NULL))
+ rte_pktmbuf_free_seg(txe[nb_tx + 1].mbuf);
+ if (likely(txe[nb_tx + 2].mbuf != NULL))
+ rte_pktmbuf_free_seg(txe[nb_tx + 2].mbuf);
+ if (likely(txe[nb_tx + 3].mbuf != NULL))
+ rte_pktmbuf_free_seg(txe[nb_tx + 3].mbuf);
+
+ mbp1 = vld1q_u64((uint64_t *)&tx_pkts[nb_tx]);
+ mbp2 = vld1q_u64((uint64_t *)&tx_pkts[nb_tx + 2]);
+ vst1q_u64((uint64_t *)&txe[nb_tx], mbp1);
+ vst1q_u64((uint64_t *)&txe[nb_tx + 2], mbp2);
+
+ queue->stats.tx_bytes +=
+ tx_pkts[nb_tx]->pkt_len + tx_pkts[nb_tx + 1]->pkt_len +
+ tx_pkts[nb_tx + 2]->pkt_len + tx_pkts[nb_tx + 3]->pkt_len;
+ macb_set_txdesc(queue, txdesc, &tx_pkts[nb_tx], tx_tail);
+ queue->stats.tx_packets += MACB_DESCS_PER_LOOP;
+ nb_tx += MACB_DESCS_PER_LOOP;
+ nb_pkts = nb_pkts - MACB_DESCS_PER_LOOP;
+ }
+
+ tx_tail = macb_tx_ring_wrap(bp, tx_tail);
+ queue->tx_tail = tx_tail;
+ if (nb_pkts > 0)
+ nb_tx += eth_macb_xmit_pkts(queue, &tx_pkts[nb_tx], nb_pkts);
+ else
+ macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+
+out:
+ return nb_tx;
+}
+
+uint16_t eth_macb_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct macb_tx_queue *queue;
+ struct macb *bp;
+ uint16_t nb_free;
+ uint16_t nb_total_free;
+ uint32_t tx_head, tx_tail;
+ uint16_t nb_tx, nb_total_tx = 0;
+
+ queue = (struct macb_tx_queue *)tx_queue;
+ bp = queue->bp;
+
+ macb_reclaim_txd(queue);
+
+retry:
+ tx_head = queue->tx_head;
+ tx_tail = queue->tx_tail;
+
+ if (unlikely(tx_head == tx_tail))
+ nb_total_free = bp->tx_ring_size - 1;
+ else if (tx_head > tx_tail)
+ nb_total_free = tx_head - tx_tail - 1;
+ else
+ nb_total_free = bp->tx_ring_size - (tx_tail - tx_head) - 1;
+
+ nb_pkts = RTE_MIN(nb_total_free, nb_pkts);
+ nb_free = bp->tx_ring_size - tx_tail;
+
+ if (nb_pkts > nb_free && nb_free > 0) {
+ nb_tx = macb_xmit_pkts_vec(queue, tx_pkts, nb_free);
+ nb_total_tx += nb_tx;
+ nb_pkts -= nb_tx;
+ tx_pkts += nb_tx;
+ goto retry;
+ }
+ if (nb_pkts > 0)
+ nb_total_tx += macb_xmit_pkts_vec(queue, tx_pkts, nb_pkts);
+
+ return nb_total_tx;
+}
diff --git a/drivers/net/macb/meson.build b/drivers/net/macb/meson.build
index 29807c0..28f7b37 100644
--- a/drivers/net/macb/meson.build
+++ b/drivers/net/macb/meson.build
@@ -15,4 +15,8 @@ sources = files(
'macb_rxtx.c',
)
+if host_machine.cpu_family() == 'aarch64'
+ sources += files('macb_rxtx_vec_neon.c')
+endif
+
includes += include_directories('base')
--
2.7.4
next prev parent reply other threads:[~2024-12-10 7:06 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-01 10:07 [PATCH v1 1/2] net/macb: add new driver liwencheng
2024-11-01 10:07 ` [PATCH v1 2/2] /usertools/dpdk-devbind:add the binding and unbinding of platform device liwencheng
2024-11-05 8:43 ` [PATCH v2 3/3] usertools/dpdk-devbind: add platform device bind/unbind liwencheng
2024-11-06 3:34 ` [PATCH v2 3/3] usertools/dpdk-devbind: add bind/unbind for platform device liwencheng
2024-11-11 7:43 ` liwencheng
2024-12-03 8:15 ` [PATCH][v2, " Wencheng Li
2024-11-06 3:55 ` [PATCH v1 2/2] /usertools/dpdk-devbind:add the binding and unbinding of " Stephen Hemminger
2024-11-12 1:08 ` liwencheng
2024-11-01 16:13 ` [PATCH v1 1/2] net/macb: add new driver Stephen Hemminger
2024-11-01 17:42 ` Stephen Hemminger
2024-11-02 5:43 ` Stephen Hemminger
2024-11-05 8:41 ` [PATCH v2 1/3] " liwencheng
2024-11-05 8:41 ` [PATCH v2 2/3] net/macb: add NEON vectorized Rx/Tx liwencheng
2024-11-06 3:33 ` liwencheng
2024-11-11 7:42 ` liwencheng
2024-12-06 8:08 ` [PATCH v3 " Wencheng Li
2024-12-10 7:05 ` Wencheng Li [this message]
2024-12-10 7:07 ` [PATCH v3 3/6] net/macb: fix logic error in macb_rxq_rearm function Wencheng Li
2024-11-06 3:32 ` [PATCH v2 1/3] net/macb: add new driver liwencheng
2024-11-11 7:42 ` liwencheng
2024-12-06 8:06 ` [PATCH v3 " Wencheng Li
2024-12-10 7:04 ` [PATCH v3 1/6] " Wencheng Li
2024-12-10 7:07 ` [PATCH v3 4/6] net/macb: zero ol_flags in each recv function Wencheng Li
2024-12-10 7:08 ` [PATCH v3 5/6] net/macb: fix tab errors in meson.build file Wencheng Li
2024-12-10 19:46 ` Stephen Hemminger
2024-12-02 20:31 ` [PATCH v2 1/3] net/macb: add new driver Stephen Hemminger
2024-11-22 17:55 ` Stephen Hemminger
2024-11-22 20:38 ` Stephen Hemminger
2024-11-22 20:44 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1733814356-412059-1-git-send-email-liwencheng@phytium.com.cn \
--to=liwencheng@phytium.com.cn \
--cc=dev@dpdk.org \
--cc=stephen@networkplumber.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).