DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] net/af_packet: add explicit flush for Tx
@ 2024-08-15 11:56 vignesh.purushotham.srinivas
  0 siblings, 0 replies; only message in thread
From: vignesh.purushotham.srinivas @ 2024-08-15 11:56 UTC (permalink / raw)
  To: linville; +Cc: dev, Vignesh PS

From: Vignesh PS <vignesh.purushotham.srinivas@ericsson.com>

af_packet PMD uses system calls to transmit packets. Separate the
transmit function into two different calls so it's possible to avoid
syscalls during transmit.

Signed-off-by: Vignesh PS <vignesh.purushotham.srinivas@ericsson.com>
---
 .mailmap                                  |  1 +
 doc/guides/nics/af_packet.rst             | 26 ++++++-
 drivers/net/af_packet/rte_eth_af_packet.c | 90 ++++++++++++++++++++++-
 3 files changed, 110 insertions(+), 7 deletions(-)

diff --git a/.mailmap b/.mailmap
index 4a508bafad..5e9462b7cd 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1548,6 +1548,7 @@ Viacheslav Ovsiienko <viacheslavo@nvidia.com> <viacheslavo@mellanox.com>
 Victor Kaplansky <victork@redhat.com>
 Victor Raj <victor.raj@intel.com>
 Vidya Sagar Velumuri <vvelumuri@marvell.com>
+Vignesh PS <vignesh.purushotham.srinivas@ericsson.com>
 Vignesh Sridhar <vignesh.sridhar@intel.com>
 Vijayakumar Muthuvel Manickam <mmvijay@gmail.com>
 Vijaya Mohan Guvva <vijay1054@gmail.com>
diff --git a/doc/guides/nics/af_packet.rst b/doc/guides/nics/af_packet.rst
index 66b977e1a2..fe92ef231f 100644
--- a/doc/guides/nics/af_packet.rst
+++ b/doc/guides/nics/af_packet.rst
@@ -29,6 +29,7 @@ Some of these, in turn, will be used to configure the PACKET_MMAP settings.
 *   ``framesz`` - PACKET_MMAP frame size (optional, default 2048B; Note: multiple
     of 16B);
 *   ``framecnt`` - PACKET_MMAP frame count (optional, default 512).
+*   ``explicit_flush`` - enable two-stage packet transmit (optional, default 0).
 
 Because this implementation is based on PACKET_MMAP, and PACKET_MMAP has its
 own pre-requisites, it should be noted that the inner workings of PACKET_MMAP
@@ -39,6 +40,9 @@ As an example, if one changes ``framesz`` to be 1024B, it is expected that
 ``blocksz`` is set to at least 1024B as well (although 2048B in this case would
 allow two "frames" per "block").
 
+When ``explicit_flush`` is enabled, the PMD temporarily buffers mbufs in a
+ring buffer until ``rte_eth_tx_done_cleanup()`` is called on the TX queue.
+
 This restriction happens because PACKET_MMAP expects each single "frame" to fit
 inside of a "block". And although multiple "frames" can fit inside of a single
 "block", a "frame" may not span across two "blocks".
@@ -64,11 +68,25 @@ framecnt=512):
 
 .. code-block:: console
 
-    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0
+    --vdev=eth_af_packet0,iface=tap0,blocksz=4096,framesz=2048,framecnt=512,qpairs=1,qdisc_bypass=0,explicit_flush=1
 
 Features and Limitations
 ------------------------
 
-The PMD will re-insert the VLAN tag transparently to the packet if the kernel
-strips it, as long as the ``RTE_ETH_RX_OFFLOAD_VLAN_STRIP`` is not enabled by the
-application.
+* The PMD will re-insert the VLAN tag transparently to the packet if the kernel
+  strips it, as long as the ``RTE_ETH_RX_OFFLOAD_VLAN_STRIP`` is not enabled by the
+  application.
+* The PMD relies on the ``sendto()`` system call to transmit packets from the PACKET_MMAP socket.
+  This system call can cause head-of-line blocking. Hence, it is advantageous to buffer the
+  packets in the driver instead of immediately triggering packet transmits on calling
+  ``rte_eth_tx_burst()``. Therefore, the PMD splits the functionality of ``rte_eth_tx_burst()``
+  into two functional stages, where ``rte_eth_tx_burst()`` causes packets to be buffered
+  in the driver, and a subsequent call to ``rte_eth_tx_done_cleanup()`` triggers the actual
+  packet transmits. With such a disaggregated PMD design, it is possible to call
+  ``rte_eth_tx_burst()`` on workers and trigger transmits (by calling
+  ``rte_eth_tx_done_cleanup()``) from a control plane worker and eliminate
+  head-of-line blocking.
+* To enable the two-stage packet transmit, the PMD should be started with ``explicit_flush=1``
+  (default: ``explicit_flush=0``).
+* When calling ``rte_eth_tx_done_cleanup()``, the ``free_cnt`` parameter has no effect on how
+  many packets are flushed. The PMD will flush all the packets present in the buffer.
diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 6b7b16f348..cdbe43313a 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -36,9 +36,11 @@
 #define ETH_AF_PACKET_FRAMESIZE_ARG	"framesz"
 #define ETH_AF_PACKET_FRAMECOUNT_ARG	"framecnt"
 #define ETH_AF_PACKET_QDISC_BYPASS_ARG	"qdisc_bypass"
+#define ETH_AF_PACKET_EXPLICIT_FLUSH_ARG	"explicit_flush"
 
 #define DFLT_FRAME_SIZE		(1 << 11)
 #define DFLT_FRAME_COUNT	(1 << 9)
+#define DFLT_FRAME_BURST	(32)
 
 struct __rte_cache_aligned pkt_rx_queue {
 	int sockfd;
@@ -62,8 +64,10 @@ struct __rte_cache_aligned pkt_tx_queue {
 
 	struct iovec *rd;
 	uint8_t *map;
+	struct rte_ring *buf;
 	unsigned int framecount;
 	unsigned int framenum;
+	unsigned int explicit_flush;
 
 	volatile unsigned long tx_pkts;
 	volatile unsigned long err_pkts;
@@ -91,6 +95,7 @@ static const char *valid_arguments[] = {
 	ETH_AF_PACKET_FRAMESIZE_ARG,
 	ETH_AF_PACKET_FRAMECOUNT_ARG,
 	ETH_AF_PACKET_QDISC_BYPASS_ARG,
+	ETH_AF_PACKET_EXPLICIT_FLUSH_ARG,
 	NULL
 };
 
@@ -198,7 +203,7 @@ tx_ring_status_available(uint32_t tp_status)
  * Callback to handle sending packets through a real NIC.
  */
 static uint16_t
-eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+eth_af_packet_tx_internal(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
 	struct tpacket2_hdr *ppd;
 	struct rte_mbuf *mbuf;
@@ -311,6 +316,59 @@ eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return i;
 }
 
+/*
+ * Callback to handle sending packets.
+ */
+static uint16_t
+eth_af_packet_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pkt_tx_queue *pkt_q = queue;
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	if (pkt_q->explicit_flush)
+		return rte_ring_enqueue_burst(pkt_q->buf,
+				(void **)bufs, nb_pkts, NULL);
+
+	return eth_af_packet_tx_internal(queue, bufs, nb_pkts);
+}
+
+/*
+ * Callback to flush previously buffered Tx packets.
+ */
+static int
+eth_af_packet_tx_flush(void *queue, uint32_t free_cnt __rte_unused)
+{
+	uint16_t sent, nb_pkts;
+	uint16_t num_flushed = 0;
+
+	struct pkt_tx_queue *pkt_q = queue;
+
+	while (true) {
+		/* flush DFLT_FRAME_BURST of buffered pkts every iteration */
+		struct rte_mbuf *bufs[DFLT_FRAME_BURST];
+		nb_pkts = rte_ring_dequeue_burst_start(pkt_q->buf,
+				   (void **)bufs, DFLT_FRAME_BURST, NULL);
+
+		if (unlikely(nb_pkts == 0))
+			break;
+
+		/* If packets are dropped internally by the function
+		 * below, it is okay not to include them in the
+		 * return value of this function because err_pkts is
+		 * updated internally.
+		 */
+		sent = eth_af_packet_tx_internal(queue, bufs, nb_pkts);
+		num_flushed +=  sent;
+
+		/* commit the dequeue operation */
+		rte_ring_dequeue_finish(pkt_q->buf, sent);
+	}
+
+	return num_flushed;
+}
+
 static int
 eth_dev_start(struct rte_eth_dev *dev)
 {
@@ -637,6 +695,7 @@ static const struct eth_dev_ops ops = {
 	.link_update = eth_link_update,
 	.stats_get = eth_stats_get,
 	.stats_reset = eth_stats_reset,
+	.tx_done_cleanup = eth_af_packet_tx_flush,
 };
 
 /*
@@ -668,6 +727,7 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
                        unsigned int framesize,
                        unsigned int framecnt,
 		       unsigned int qdisc_bypass,
+		       unsigned int explicit_flush,
                        struct pmd_internals **internals,
                        struct rte_eth_dev **eth_dev,
                        struct rte_kvargs *kvlist)
@@ -885,6 +945,18 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 			goto error;
 		}
 
+		char buf_name[RTE_RING_NAMESIZE];
+		snprintf(buf_name, RTE_RING_NAMESIZE, "%s:txq%u", name, q);
+		tx_queue->buf = rte_ring_create(buf_name, tx_queue->framecount,
+				  numa_node, RING_F_SP_ENQ | RING_F_SC_DEQ);
+		if (tx_queue->buf == NULL) {
+			PMD_LOG(ERR,
+				"%s: could not create ring buffer. err=%s",
+				buf_name, rte_strerror(rte_errno));
+			goto error;
+		}
+		tx_queue->explicit_flush = explicit_flush;
+
 #if defined(PACKET_FANOUT)
 		rc = setsockopt(qsockfd, SOL_PACKET, PACKET_FANOUT,
 				&fanout_arg, sizeof(fanout_arg));
@@ -962,6 +1034,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	unsigned int framecount = DFLT_FRAME_COUNT;
 	unsigned int qpairs = 1;
 	unsigned int qdisc_bypass = 1;
+	unsigned int explicit_flush = 0;
 
 	/* do some parameter checking */
 	if (*sockfd < 0)
@@ -1024,6 +1097,16 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			}
 			continue;
 		}
+		if (strstr(pair->key, ETH_AF_PACKET_EXPLICIT_FLUSH_ARG) != NULL) {
+			explicit_flush = atoi(pair->value);
+			if (explicit_flush > 1) {
+				PMD_LOG(ERR,
+					"%s: invalid explicit_flush value",
+					name);
+				return -1;
+			}
+			continue;
+		}
 	}
 
 	if (framesize > blocksize) {
@@ -1049,7 +1132,7 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	if (rte_pmd_init_internals(dev, *sockfd, qpairs,
 				   blocksize, blockcount,
 				   framesize, framecount,
-				   qdisc_bypass,
+				   qdisc_bypass, explicit_flush,
 				   &internals, &eth_dev,
 				   kvlist) < 0)
 		return -1;
@@ -1146,4 +1229,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_af_packet,
 	"blocksz=<int> "
 	"framesz=<int> "
 	"framecnt=<int> "
-	"qdisc_bypass=<0|1>");
+	"qdisc_bypass=<0|1> "
+	"explicit_flush=<0|1>");
-- 
2.34.1


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-08-15 11:57 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-15 11:56 [PATCH] net/af_packet: add explicit flush for Tx vignesh.purushotham.srinivas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).