DPDK patches and discussions
From: Stephen Hemminger <stephen@networkplumber.org>
To: dev@dpdk.org
Cc: Stephen Hemminger <stephen@networkplumber.org>
Subject: [PATCH v3 9/9] net/ioring: support multi-segment Rx and Tx
Date: Tue, 11 Mar 2025 16:51:27 -0700
Message-ID: <20250311235424.172440-10-stephen@networkplumber.org>
In-Reply-To: <20250311235424.172440-1-stephen@networkplumber.org>

Use readv/writev to handle multi-segment receive and transmit.
Account for the virtio_net header that will be used for offloads
in a later patch.
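
For reference, a minimal sketch (not part of this patch) of the framing
that IFF_VNET_HDR implies: every read from the tap fd yields a struct
virtio_net_hdr immediately followed by the frame, so the receive path
must reserve room for the header and strip it afterwards. Assuming only
standard readv(2) and the kernel uapi header:

  #include <sys/uio.h>
  #include <linux/virtio_net.h>

  /* Illustrative only: split the virtio header from the frame with a
   * two-element iovec, much as the driver does across mbuf segments.
   */
  static ssize_t
  read_tap_frame(int tap_fd, void *frame, size_t frame_len)
  {
          struct virtio_net_hdr hdr;
          struct iovec iov[2] = {
                  { .iov_base = &hdr,  .iov_len = sizeof(hdr) },
                  { .iov_base = frame, .iov_len = frame_len },
          };

          /* The return value counts the header too; subtract
           * sizeof(hdr) to get the Ethernet frame length. */
          return readv(tap_fd, iov, 2);
  }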

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/ioring/rte_eth_ioring.c | 140 ++++++++++++++++++++--------
 1 file changed, 102 insertions(+), 38 deletions(-)
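
As a sizing sanity check for the scattered receive below (assuming the
default 128-byte RTE_PKTMBUF_HEADROOM and a 2048-byte mbuf data room,
i.e. 1920 bytes of tailroom per segment): a 9000-byte MTU plus the
10-byte struct virtio_net_hdr needs ceil(9010 / 1920) = 5 chained mbufs
per receive, comfortably below IOV_MAX.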

diff --git a/drivers/net/ioring/rte_eth_ioring.c b/drivers/net/ioring/rte_eth_ioring.c
index 83446dc660..a803a9820b 100644
--- a/drivers/net/ioring/rte_eth_ioring.c
+++ b/drivers/net/ioring/rte_eth_ioring.c
@@ -18,6 +18,7 @@
 #include <linux/if.h>
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
+#include <linux/virtio_net.h>
 
 #include <bus_vdev_driver.h>
 #include <ethdev_driver.h>
@@ -35,8 +36,11 @@
 #define IORING_MAX_QUEUES	128
 static_assert(IORING_MAX_QUEUES <= RTE_MP_MAX_FD_NUM, "Max queues exceeds MP fd limit");
 
-#define IORING_TX_OFFLOAD	RTE_ETH_TX_OFFLOAD_VLAN_INSERT
-#define IORING_RX_OFFLOAD	RTE_ETH_RX_OFFLOAD_VLAN_STRIP
+#define IORING_TX_OFFLOAD	(RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
+				 RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
+
+#define IORING_RX_OFFLOAD	(RTE_ETH_RX_OFFLOAD_VLAN_STRIP | \
+				 RTE_ETH_RX_OFFLOAD_SCATTER)
 
 #define IORING_DEFAULT_IFNAME	"itap%d"
 #define IORING_MP_KEY		"ioring_mp_send_fds"
@@ -166,7 +170,7 @@ tap_open(const char *name, struct ifreq *ifr, uint8_t persist)
 		goto error;
 	}
 
-	int flags = IFF_TAP | IFF_MULTI_QUEUE | IFF_NO_PI;
+	int flags = IFF_TAP | IFF_MULTI_QUEUE | IFF_NO_PI | IFF_VNET_HDR;
 	if ((features & flags) != flags) {
 		PMD_LOG(ERR, "TUN features %#x missing support for %#x",
 			features, features & flags);
@@ -354,6 +358,8 @@ eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->max_rx_queues = IORING_MAX_QUEUES;
 	dev_info->max_tx_queues = IORING_MAX_QUEUES;
 	dev_info->min_rx_bufsize = 0;
+	dev_info->tx_queue_offload_capa = IORING_TX_OFFLOAD;
+	dev_info->tx_offload_capa = dev_info->tx_queue_offload_capa;
 
 	dev_info->default_rxportconf = (struct rte_eth_dev_portconf) {
 		.burst_size = IORING_DEFAULT_BURST,
@@ -487,13 +493,44 @@ eth_rx_submit(struct rx_queue *rxq, int fd, struct rte_mbuf *mb)
 		PMD_LOG(DEBUG, "io_uring no rx sqe");
 		rxq->rx_errors++;
 		rte_pktmbuf_free(mb);
-	} else {
-		void *base = rte_pktmbuf_mtod(mb, void *);
-		size_t len = mb->buf_len;
+		return;
+	}
 
-		io_uring_prep_read(sqe, fd, base, len, 0);
-		io_uring_sqe_set_data(sqe, mb);
+	RTE_VERIFY(mb->nb_segs < IOV_MAX);
+	struct iovec iovs[IOV_MAX];
+	uint16_t niov = mb->nb_segs;
+
+	/* Record the head of the chain before the walk advances mb. */
+	io_uring_sqe_set_data(sqe, mb);
+
+	for (uint16_t i = 0; i < niov; i++) {
+		iovs[i].iov_base = rte_pktmbuf_mtod(mb, void *);
+		iovs[i].iov_len = rte_pktmbuf_tailroom(mb);
+		mb = mb->next;
 	}
+	io_uring_prep_readv(sqe, fd, iovs, niov, 0);
+}
+
+static struct rte_mbuf *
+eth_ioring_rx_alloc(struct rx_queue *rxq)
+{
+	const struct rte_eth_dev *dev = &rte_eth_devices[rxq->port_id];
+	int buf_size = dev->data->mtu + sizeof(struct virtio_net_hdr);
+	struct rte_mbuf *m = NULL;
+	struct rte_mbuf **tail = &m;
+
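+	/* Chain mbufs until there is room for the MTU plus virtio header. */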
+	do {
+		struct rte_mbuf *seg = rte_pktmbuf_alloc(rxq->mb_pool);
+		if (unlikely(seg == NULL)) {
+			rte_pktmbuf_free(m);
+			return NULL;
+		}
+		*tail = seg;
+		tail = &seg->next;
+		if (seg != m)
+			++m->nb_segs;
+
+		buf_size -= rte_pktmbuf_tailroom(seg);
+	} while (buf_size > 0);
+
+	return m;
 }
 
 static uint16_t
@@ -513,7 +550,8 @@ eth_ioring_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		PMD_RX_LOG(DEBUG, "cqe %u len %zd", num_cqe, len);
 		num_cqe++;
 
-		if (unlikely(len < RTE_ETHER_HDR_LEN)) {
+		struct virtio_net_hdr *hdr;
+		if (unlikely(len < (ssize_t)(sizeof(*hdr) + RTE_ETHER_HDR_LEN))) {
 			if (len < 0)
 				PMD_LOG(ERR, "io_uring_read: %s", strerror(-len));
 			else
@@ -523,19 +561,31 @@ eth_ioring_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 			goto resubmit;
 		}
 
-		struct rte_mbuf *nmb = rte_pktmbuf_alloc(rxq->mb_pool);
-		if (unlikely(nmb == 0)) {
-			PMD_LOG(DEBUG, "Rx mbuf alloc failed");
+		hdr = rte_pktmbuf_mtod(mb, struct virtio_net_hdr *);
+
+		struct rte_mbuf *nmb = eth_ioring_rx_alloc(rxq);
+		if (nmb == NULL) {
 			++rxq->rx_nombuf;
 			goto resubmit;
 		}
 
-		if (rxq->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP)
-			rte_vlan_strip(mb);
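+		/* Skip the virtio_net_hdr that precedes the frame data. */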
+		len -= sizeof(*hdr);
+		mb->data_off += sizeof(*hdr);
 
 		mb->pkt_len = len;
-		mb->data_len = len;
 		mb->port = rxq->port_id;
+		struct rte_mbuf *seg = mb;
+		do {
+			seg->data_len = RTE_MIN(len, (ssize_t)seg->buf_len);
+			len -= seg->data_len;
+			seg = seg->next;
+		} while (len > 0 && seg != NULL);
+
+		RTE_VERIFY(!(seg == NULL && len > 0));
+
+		if (rxq->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP)
+			rte_vlan_strip(mb);
+
 		__rte_mbuf_sanity_check(mb, 1);
 
 		num_bytes += len;
@@ -555,6 +605,7 @@ eth_ioring_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return num_rx;
 }
 
+
 static int
 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id, uint16_t nb_rx_desc,
 		   unsigned int socket_id, const struct rte_eth_rxconf *rx_conf,
@@ -587,20 +638,17 @@ eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_id, uint16_t nb_rx_de
 		return -1;
 	}
 
-	struct rte_mbuf **mbufs = alloca(nb_rx_desc * sizeof(struct rte_mbuf *));
-	if (mbufs == NULL) {
-		PMD_LOG(ERR, "alloca for %u failed", nb_rx_desc);
-		return -1;
-	}
+	int fd = eth_queue_fd(rxq->port_id, rxq->queue_id);
 
-	if (rte_pktmbuf_alloc_bulk(mb_pool, mbufs, nb_rx_desc) < 0) {
-		PMD_LOG(ERR, "Rx mbuf alloc %u bufs failed", nb_rx_desc);
-		return -1;
-	}
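+	/* Each descriptor gets a chained mbuf sized for MTU plus virtio header. */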
+	for (uint16_t i = 0; i < nb_rx_desc; i++) {
+		struct rte_mbuf *mb = eth_ioring_rx_alloc(rxq);
+		if (mb == NULL) {
+			PMD_LOG(ERR, "Rx mbuf alloc buf failed");
+			return -1;
+		}
 
-	int fd = eth_queue_fd(rxq->port_id, rxq->queue_id);
-	for (uint16_t i = 0; i < nb_rx_desc; i++)
-		eth_rx_submit(rxq, fd, mbufs[i]);
+		eth_rx_submit(rxq, fd, mb);
+	}
 
 	io_uring_submit(&rxq->io_ring);
 	return 0;
@@ -740,31 +788,47 @@ eth_ioring_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
 
-	if (io_uring_sq_space_left(&txq->io_ring) < txq->free_thresh)
+	if (likely(io_uring_sq_space_left(&txq->io_ring) < txq->free_thresh))
 		eth_ioring_tx_cleanup(txq);
 
 	int fd = eth_queue_fd(txq->port_id, txq->queue_id);
 
 	for (num_tx = 0; num_tx < nb_pkts; num_tx++) {
 		struct rte_mbuf *mb = bufs[num_tx];
+		struct virtio_net_hdr *hdr;
 
 		struct io_uring_sqe *sqe = io_uring_get_sqe(&txq->io_ring);
 		if (sqe == NULL)
 			break;	/* submit ring is full */
 
-		io_uring_sqe_set_data(sqe, mb);
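+		/* Write the virtio_net_hdr into the mbuf's own headroom when the
+		 * mbuf is safe to modify; otherwise chain a separate header mbuf
+		 * in front of the packet.
+		 */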
+		if (rte_mbuf_refcnt_read(mb) == 1 && RTE_MBUF_DIRECT(mb) &&
+		    rte_pktmbuf_headroom(mb) >= sizeof(*hdr)) {
+			hdr = (struct virtio_net_hdr *)rte_pktmbuf_prepend(mb, sizeof(*hdr));
+		} else {
+			struct rte_mbuf *mh = rte_pktmbuf_alloc(mb->pool);
+			if (unlikely(mh == NULL))
+				break;
+			hdr = (struct virtio_net_hdr *)rte_pktmbuf_append(mh, sizeof(*hdr));
 
-		if (rte_mbuf_refcnt_read(mb) == 1 &&
-		    RTE_MBUF_DIRECT(mb) && mb->nb_segs == 1) {
-			void *base = rte_pktmbuf_mtod(mb, void *);
-			io_uring_prep_write(sqe, fd, base, mb->pkt_len, 0);
+			mh->next = mb;
+			mh->nb_segs = mb->nb_segs + 1;
+			mh->pkt_len += mb->pkt_len;
+			mh->ol_flags = mb->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK;
+			mb = mh;
+		}
+		memset(hdr, 0, sizeof(*hdr));
 
-			PMD_TX_LOG(DEBUG, "tx mbuf: %p submit", mb);
-		} else {
-			PMD_LOG(ERR, "Can't do mbuf without space yet!");
-			++txq->tx_errors;
-			continue;
+		io_uring_sqe_set_data(sqe, mb);
+
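+		/* Gather every segment, header included, into a single writev. */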
+		struct iovec iovs[RTE_MBUF_MAX_NB_SEGS + 1];
+		unsigned int niov = mb->nb_segs;
+		for (unsigned int i = 0; i < niov; i++) {
+			iovs[i].iov_base = rte_pktmbuf_mtod(mb, char *);
+			iovs[i].iov_len = mb->data_len;
+			mb = mb->next;
 		}
+
+		io_uring_prep_writev(sqe, fd, iovs, niov, 0);
 	}
 	if (num_tx > 0)
 		io_uring_submit(&txq->io_ring);
-- 
2.47.2



Thread overview: 35+ messages
2024-12-10 21:23 [RFC 0/8] ioring: network driver Stephen Hemminger
2024-12-10 21:23 ` [RFC 1/8] net/ioring: introduce new driver Stephen Hemminger
2024-12-10 21:23 ` [RFC 2/8] net/ioring: implement link state Stephen Hemminger
2024-12-10 21:23 ` [RFC 3/8] net/ioring: implement control functions Stephen Hemminger
2024-12-10 21:23 ` [RFC 4/8] net/ioring: implement management functions Stephen Hemminger
2024-12-10 21:23 ` [RFC 5/8] net/ioring: implement primary secondary fd passing Stephen Hemminger
2024-12-10 21:23 ` [RFC 6/8] net/ioring: implement receive and transmit Stephen Hemminger
2024-12-10 21:23 ` [RFC 7/8] net/ioring: add VLAN support Stephen Hemminger
2024-12-10 21:23 ` [RFC 8/8] net/ioring: implement statistics Stephen Hemminger
2024-12-11 11:34 ` [RFC 0/8] ioring: network driver Konstantin Ananyev
2024-12-11 15:03   ` Stephen Hemminger
2024-12-12 19:06     ` Konstantin Ananyev
2024-12-19 15:40       ` Morten Brørup
2024-12-20 14:34         ` Konstantin Ananyev
2024-12-20 16:19           ` Stephen Hemminger
2024-12-11 16:28 ` [PATCH v2 " Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 1/8] net/ioring: introduce new driver Stephen Hemminger
2024-12-28 16:39     ` Morten Brørup
2024-12-11 16:28   ` [PATCH v2 2/8] net/ioring: implement link state Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 3/8] net/ioring: implement control functions Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 4/8] net/ioring: implement management functions Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 5/8] net/ioring: implement primary secondary fd passing Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 6/8] net/ioring: implement receive and transmit Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 7/8] net/ioring: add VLAN support Stephen Hemminger
2024-12-11 16:28   ` [PATCH v2 8/8] net/ioring: implement statistics Stephen Hemminger
2025-03-11 23:51 ` [PATCH v3 0/9] ioring PMD device Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 1/9] net/ioring: introduce new driver Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 2/9] net/ioring: implement link state Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 3/9] net/ioring: implement control functions Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 4/9] net/ioring: implement management functions Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 5/9] net/ioring: implement secondary process support Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 6/9] net/ioring: implement receive and transmit Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 7/9] net/ioring: add VLAN support Stephen Hemminger
2025-03-11 23:51   ` [PATCH v3 8/9] net/ioring: implement statistics Stephen Hemminger
2025-03-11 23:51   ` Stephen Hemminger [this message]
