DPDK patches and discussions
* [dpdk-dev] [PATCH v4] net/memif: zero-copy slave
@ 2019-07-09  8:22 Jakub Grajciar
  2019-07-10 15:06 ` Ferruh Yigit
  2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
  0 siblings, 2 replies; 20+ messages in thread
From: Jakub Grajciar @ 2019-07-09  8:22 UTC (permalink / raw)
  To: dev; +Cc: anatoly.burakov, Jakub Grajciar

Zero-copy slave support for memif PMD.
Slave interface exposes DPDK memory to
master interface. Only single file segments
are supported (EAL option --single-file-segments).

Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
---
 doc/guides/nics/memif.rst                     |  29 ++
 drivers/net/memif/Makefile                    |   1 +
 drivers/net/memif/memif_socket.c              |  63 +--
 drivers/net/memif/meson.build                 |   1 +
 drivers/net/memif/rte_eth_memif.c             | 451 +++++++++++++++++-
 drivers/net/memif/rte_eth_memif.h             |   9 +-
 lib/librte_eal/common/eal_common_mcfg.c       |   7 +
 .../common/include/rte_eal_memconfig.h        |  10 +
 lib/librte_eal/rte_eal_version.map            |   1 +
 9 files changed, 503 insertions(+), 69 deletions(-)

V2:
- fix coding style

V3:
- fix compilation issues

V4:
- don't move existing code
- add new EAL API rte_mcfg_get_single_file_segments();
  mem_config is now private, so this API returns the
  single_file_segments parameter value
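
For context, a minimal sketch of how the new EAL API is intended to be used
by the driver; the real check lives in memif_set_zc() further down in this
patch, and check_zc_allowed() is only a hypothetical helper name:

#include <errno.h>
#include <rte_eal_memconfig.h>

#include "rte_eth_memif.h"   /* ETH_MEMIF_FLAG_ZERO_COPY */

/* Gate the zero-copy devarg on --single-file-segments: without it the
 * driver cannot cheaply translate a packet buffer address into a region
 * offset.
 */
static int
check_zc_allowed(uint32_t *flags)
{
	if (!rte_mcfg_get_single_file_segments())
		return -ENOTSUP;

	*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
	return 0;
}

The API itself only reads mcfg->single_file_segments, which EAL sets from
the '--single-file-segments' command line option.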

diff --git a/doc/guides/nics/memif.rst b/doc/guides/nics/memif.rst
index de2d481eb..46cadb13f 100644
--- a/doc/guides/nics/memif.rst
+++ b/doc/guides/nics/memif.rst
@@ -171,6 +171,35 @@ Files
 - net/memif/memif.h *- descriptor and ring definitions*
 - net/memif/rte_eth_memif.c *- eth_memif_rx() eth_memif_tx()*

+Zero-copy slave
+~~~~~~~~~~~~~~~
+
+**Shared memory format**
+
+Region 0 is created by memif driver and contains rings. Slave interface exposes DPDK memory (memseg).
+Instead of using memfd_create() to create new shared file, existing memsegs are used.
+Master interface functions the same as with zero-copy disabled.
+
+region 0:
+
++-----------------------+
+| Rings                 |
++-----------+-----------+
+| S2M rings | M2S rings |
++-----------+-----------+
+
+region n:
+
++-----------------+
+| Buffers         |
++-----------------+
+|memseg           |
++-----------------+
+
+Buffers are dequeued and enqueued as needed. Offset descriptor field is calculated at tx.
+Only single file segments mode (EAL option --single-file-segments) is supported, as calculating
+offset from multiple segments is too expensive.
+
 Example: testpmd
 ----------------------------
 In this example we run two instances of testpmd application and transmit packets over memif.
diff --git a/drivers/net/memif/Makefile b/drivers/net/memif/Makefile
index fdbdf3378..8f5c6dd0d 100644
--- a/drivers/net/memif/Makefile
+++ b/drivers/net/memif/Makefile
@@ -20,6 +20,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool
 LDLIBS += -lrte_ethdev -lrte_kvargs
 LDLIBS += -lrte_hash
diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
index 01a935f87..2bd7f2927 100644
--- a/drivers/net/memif/memif_socket.c
+++ b/drivers/net/memif/memif_socket.c
@@ -176,8 +176,7 @@ memif_msg_receive_hello(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_name, (char *)h->name, sizeof(pmd->remote_name));

-	MIF_LOG(DEBUG, "%s: Connecting to %s.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_name);
+	MIF_LOG(DEBUG, "Connecting to %s.", pmd->remote_name);

 	return 0;
 }
@@ -339,8 +338,7 @@ memif_msg_receive_connect(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -358,8 +356,7 @@ memif_msg_receive_connected(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -370,14 +367,13 @@ memif_msg_receive_disconnect(struct rte_eth_dev *dev, memif_msg_t *msg)
 	struct pmd_internals *pmd = dev->data->dev_private;
 	memif_msg_disconnect_t *d = &msg->disconnect;

-	memset(pmd->remote_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->remote_disc_string, 0, sizeof(pmd->remote_disc_string));
 	strlcpy(pmd->remote_disc_string, (char *)d->string,
 		sizeof(pmd->remote_disc_string));

-	MIF_LOG(INFO, "%s: Disconnect received: %s",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_disc_string);
+	MIF_LOG(INFO, "Disconnect received: %s", pmd->remote_disc_string);

-	memset(pmd->local_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->local_disc_string, 0, 96);
 	memif_disconnect(dev);
 	return 0;
 }
@@ -473,7 +469,6 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connect_t *c;

 	if (e == NULL)
@@ -481,7 +476,7 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)

 	c = &e->msg.connect;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECT;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -491,7 +486,6 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connected_t *c;

 	if (e == NULL)
@@ -499,7 +493,7 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)

 	c = &e->msg.connected;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECTED;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -525,7 +519,6 @@ void
 memif_disconnect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
-	struct pmd_process_private *proc_private = dev->process_private;
 	struct memif_msg_queue_elt *elt, *next;
 	struct memif_queue *mq;
 	struct rte_intr_handle *ih;
@@ -615,7 +608,7 @@ memif_disconnect(struct rte_eth_dev *dev)
 		}
 	}

-	memif_free_regions(proc_private);
+	memif_free_regions(dev);

 	/* reset connection configuration */
 	memset(&pmd->run, 0, sizeof(pmd->run));
@@ -662,7 +655,7 @@ memif_msg_receive(struct memif_control_channel *cc)
 			if (cmsg->cmsg_type == SCM_CREDENTIALS)
 				cr = (struct ucred *)CMSG_DATA(cmsg);
 			else if (cmsg->cmsg_type == SCM_RIGHTS)
-				memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
+				rte_memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
 		}
 		cmsg = CMSG_NXTHDR(&mh, cmsg);
 	}
@@ -861,7 +854,7 @@ memif_listener_handler(void *arg)
 }

 static struct memif_socket *
-memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
+memif_socket_create(char *key, uint8_t listener)
 {
 	struct memif_socket *sock;
 	struct sockaddr_un un;
@@ -899,17 +892,15 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 		if (ret < 0)
 			goto error;

-		MIF_LOG(DEBUG, "%s: Memif listener socket %s created.",
-			rte_vdev_device_name(pmd->vdev), sock->filename);
+		MIF_LOG(DEBUG, "Memif listener socket %s created.", sock->filename);

 		sock->intr_handle.fd = sockfd;
 		sock->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		ret = rte_intr_callback_register(&sock->intr_handle,
 						 memif_listener_handler, sock);
 		if (ret < 0) {
-			MIF_LOG(ERR, "%s: Failed to register interrupt "
-				"callback for listener socket",
-				rte_vdev_device_name(pmd->vdev));
+			MIF_LOG(ERR, "Failed to register interrupt "
+				"callback for listener socket");
 			return NULL;
 		}
 	}
@@ -917,8 +908,7 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 	return sock;

  error:
-	MIF_LOG(ERR, "%s: Failed to setup socket %s: %s",
-		rte_vdev_device_name(pmd->vdev), key, strerror(errno));
+	MIF_LOG(ERR, "Failed to setup socket %s: %s", key, strerror(errno));
 	if (sock != NULL)
 		rte_free(sock);
 	return NULL;
@@ -960,9 +950,8 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)
 	rte_memcpy(key, socket_filename, strlen(socket_filename));
 	ret = rte_hash_lookup_data(hash, key, (void **)&socket);
 	if (ret < 0) {
-		socket = memif_socket_create(pmd, key,
-					     (pmd->role ==
-					      MEMIF_ROLE_SLAVE) ? 0 : 1);
+		socket = memif_socket_create(key,
+					     (pmd->role == MEMIF_ROLE_SLAVE) ? 0 : 1);
 		if (socket == NULL)
 			return -1;
 		ret = rte_hash_add_key_data(hash, key, socket);
@@ -993,8 +982,7 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)

 	elt = rte_malloc("pmd-queue", sizeof(struct memif_socket_dev_list_elt), 0);
 	if (elt == NULL) {
-		MIF_LOG(ERR, "%s: Failed to add device to socket device list.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to add device to socket device list.");
 		return -1;
 	}
 	elt->dev = dev;
@@ -1068,8 +1056,7 @@ memif_connect_slave(struct rte_eth_dev *dev)

 	sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
 	if (sockfd < 0) {
-		MIF_LOG(ERR, "%s: Failed to open socket.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to open socket.");
 		return -1;
 	}

@@ -1080,19 +1067,16 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = connect(sockfd, (struct sockaddr *)&sun,
 		      sizeof(struct sockaddr_un));
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to connect socket: %s.",
-			rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+		MIF_LOG(ERR, "Failed to connect socket: %s.", pmd->socket_filename);
 		goto error;
 	}

-	MIF_LOG(DEBUG, "%s: Memif socket: %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+	MIF_LOG(DEBUG, "Memif socket: %s connected.", pmd->socket_filename);

 	pmd->cc = rte_zmalloc("memif-cc",
 			      sizeof(struct memif_control_channel), 0);
 	if (pmd->cc == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate control channel.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to allocate control channel.");
 		goto error;
 	}

@@ -1105,8 +1089,7 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = rte_intr_callback_register(&pmd->cc->intr_handle,
 					 memif_intr_handler, pmd->cc);
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to register interrupt callback "
-			"for control fd", rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to register interrupt callback for control fd");
 		goto error;
 	}

diff --git a/drivers/net/memif/meson.build b/drivers/net/memif/meson.build
index bedc97311..6b62255f9 100644
--- a/drivers/net/memif/meson.build
+++ b/drivers/net/memif/meson.build
@@ -14,5 +14,6 @@ allow_experimental_apis = true
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments

 deps += ['hash']
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index e9ddf6413..2d9aa64c2 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -23,6 +23,10 @@
 #include <rte_kvargs.h>
 #include <rte_bus_vdev.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal_memconfig.h>

 #include "rte_eth_memif.h"
 #include "memif_socket.h"
@@ -50,6 +54,10 @@ static const char * const valid_arguments[] = {

 #define MEMIF_MP_SEND_REGION		"memif_mp_send_region"

+
+static int memif_region_init_zc(const struct rte_memseg_list *msl,
+				const struct rte_memseg *ms, void *arg);
+
 const char *
 memif_version(void)
 {
@@ -116,10 +124,14 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 	struct mp_region_msg *reply_param;
 	struct memif_region *r;
 	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
+	/* in case of zero-copy slave, only request region 0 */
+	uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
+				   1 : ETH_MEMIF_MAX_REGION_NUM;

 	MIF_LOG(DEBUG, "Requesting memory regions");

-	for (i = 0; i < ETH_MEMIF_MAX_REGION_NUM; i++) {
+	for (i = 0; i < max_region_num; i++) {
 		/* Prepare the message */
 		memset(&msg, 0, sizeof(msg));
 		strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
@@ -161,6 +173,12 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 		free(reply);
 	}

+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
+		if (ret < 0)
+			return ret;
+	}
+
 	return memif_connect(dev);
 }

@@ -218,6 +236,23 @@ memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
 	return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
 }

+/* Free mbufs received by master */
+static void
+memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
+{
+	uint16_t mask = (1 << mq->log2_ring_size) - 1;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+
+	/* FIXME: improve performance */
+	while (mq->last_tail != ring->tail) {
+		RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
+		/* Decrement refcnt and free mbuf. (current segment) */
+		rte_mbuf_refcnt_update(mq->buffers[mq->last_tail & mask], -1);
+		rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
+		mq->last_tail++;
+	}
+}
+
 static int
 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
 		    struct rte_mbuf *tail)
@@ -323,8 +358,8 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;

 			memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, dst_off),
-			       (uint8_t *)memif_get_buffer(proc_private, d0) +
-			       src_off, cp_len);
+			       (uint8_t *)memif_get_buffer(proc_private, d0) + src_off,
+			       cp_len);

 			src_off += cp_len;
 			dst_off += cp_len;
@@ -369,6 +404,120 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_rx_pkts;
 }

+static uint16_t
+eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
+	uint16_t n_rx_pkts = 0;
+	memif_desc_t *d0;
+	struct rte_mbuf *mbuf, *mbuf_tail;
+	struct rte_mbuf *mbuf_head = NULL;
+	int ret;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	/* consume interrupt */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t b;
+		ssize_t size __rte_unused;
+		size = read(mq->intr_handle.fd, &b, sizeof(b));
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	cur_slot = mq->last_tail;
+	last_slot = ring->tail;
+	if (cur_slot == last_slot)
+		goto refill;
+	n_slots = last_slot - cur_slot;
+
+	while (n_slots && n_rx_pkts < nb_pkts) {
+		s0 = cur_slot & mask;
+
+		d0 = &ring->desc[s0];
+		mbuf_head = mq->buffers[s0];
+		mbuf = mbuf_head;
+
+next_slot:
+		/* prefetch next descriptor */
+		if (n_rx_pkts + 1 < nb_pkts)
+			rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
+
+		mbuf->port = mq->in_port;
+		rte_pktmbuf_data_len(mbuf) = d0->length;
+		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+
+		mq->n_bytes += rte_pktmbuf_data_len(mbuf);
+
+		cur_slot++;
+		n_slots--;
+		if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
+			mbuf_tail = mbuf;
+			mbuf = mq->buffers[s0];
+			ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+			if (unlikely(ret < 0)) {
+				MIF_LOG(ERR, "number-of-segments-overflow");
+				goto refill;
+			}
+			goto next_slot;
+		}
+
+		*bufs++ = mbuf_head;
+		n_rx_pkts++;
+	}
+
+	mq->last_tail = cur_slot;
+
+/* Supply master with new buffers */
+refill:
+	head = ring->head;
+	n_slots = ring_size - head + mq->last_tail;
+
+	if (n_slots < 32)
+		goto no_free_mbufs;
+
+	ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
+	if (unlikely(ret < 0))
+		goto no_free_mbufs;
+
+	while (n_slots--) {
+		s0 = head++ & mask;
+		if (n_slots > 0)
+			rte_prefetch0(mq->buffers[head & mask]);
+		d0 = &ring->desc[s0];
+		/* store buffer header */
+		mbuf = mq->buffers[s0];
+		/* populate descriptor */
+		d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
+				RTE_PKTMBUF_HEADROOM;
+		d0->region = 1;
+		d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+			(uint8_t *)proc_private->regions[d0->region]->addr;
+	}
+no_free_mbufs:
+	rte_mb();
+	ring->head = head;
+
+	mq->n_pkts += n_rx_pkts;
+
+	return n_rx_pkts;
+}
+
 static uint16_t
 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -484,19 +633,174 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }

+
+static int
+memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
+		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
+		uint16_t slot, uint16_t n_free)
+{
+	memif_desc_t *d0;
+	int used_slots = 1;
+
+next_in_chain:
+	/* store pointer to mbuf to free it later */
+	mq->buffers[slot & mask] = mbuf;
+	/* Increment refcnt to make sure the buffer is not freed before master
+	 * receives it. (current segment)
+	 */
+	rte_mbuf_refcnt_update(mbuf, 1);
+	/* populate descriptor */
+	d0 = &ring->desc[slot & mask];
+	d0->length = rte_pktmbuf_data_len(mbuf);
+	/* FIXME: get region index */
+	d0->region = 1;
+	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+		(uint8_t *)proc_private->regions[d0->region]->addr;
+	d0->flags = 0;
+
+	/* check if buffer is chained */
+	if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
+		if (n_free < 2)
+			return 0;
+		/* mark buffer as chained */
+		d0->flags |= MEMIF_DESC_FLAG_NEXT;
+		/* advance mbuf */
+		mbuf = mbuf->next;
+		/* update counters */
+		used_slots++;
+		slot++;
+		n_free--;
+		goto next_in_chain;
+	}
+	return used_slots;
+}
+
+static uint16_t
+eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
+	memif_ring_type_t type = mq->type;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	/* free mbufs received by master */
+	memif_free_stored_mbufs(proc_private, mq);
+
+	/* ring type always MEMIF_RING_S2M */
+	slot = ring->head;
+	n_free = ring_size - ring->head + mq->last_tail;
+
+	int used_slots;
+
+	while (n_free && (n_tx_pkts < nb_pkts)) {
+		while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
+			if ((nb_pkts - n_tx_pkts) > 8) {
+				rte_prefetch0(*bufs + 4);
+				rte_prefetch0(*bufs + 5);
+				rte_prefetch0(*bufs + 6);
+				rte_prefetch0(*bufs + 7);
+			}
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+		}
+		used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+			mask, slot, n_free);
+		if (unlikely(used_slots < 1))
+			goto no_free_slots;
+		n_tx_pkts++;
+		slot += used_slots;
+		n_free -= used_slots;
+	}
+
+no_free_slots:
+	rte_mb();
+	/* update ring pointers */
+	if (type == MEMIF_RING_S2M)
+		ring->head = slot;
+	else
+		ring->tail = slot;
+
+	/* Send interrupt, if enabled. */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t a = 1;
+		ssize_t size = write(mq->intr_handle.fd, &a, sizeof(a));
+		if (unlikely(size < 0)) {
+			MIF_LOG(WARNING,
+				"Failed to send interrupt. %s", strerror(errno));
+		}
+	}
+
+	/* increment queue counters */
+	mq->n_err += nb_pkts - n_tx_pkts;
+	mq->n_pkts += n_tx_pkts;
+
+	return n_tx_pkts;
+}
+
 void
-memif_free_regions(struct pmd_process_private *proc_private)
+memif_free_regions(struct rte_eth_dev *dev)
 {
+	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int i;
 	struct memif_region *r;

-	MIF_LOG(DEBUG, "Free memory regions");
 	/* regions are allocated contiguously, so it's
 	 * enough to loop until 'proc_private->regions_num'
 	 */
 	for (i = 0; i < proc_private->regions_num; i++) {
 		r = proc_private->regions[i];
 		if (r != NULL) {
+			/* This is memzone */
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				r->addr = NULL;
+				if (r->fd > 0)
+					close(r->fd);
+			}
 			if (r->addr != NULL) {
 				munmap(r->addr, r->region_size);
 				if (r->fd > 0) {
@@ -511,6 +815,45 @@ memif_free_regions(struct pmd_process_private *proc_private)
 	proc_private->regions_num = 0;
 }

+static int
+memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		     void *arg)
+{
+	struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
+	struct memif_region *r;
+
+	if (proc_private->regions_num < 1) {
+		MIF_LOG(ERR, "Missing descriptor region");
+		return -1;
+	}
+
+	r = proc_private->regions[proc_private->regions_num - 1];
+
+	if (r->addr != msl->base_va)
+		r = proc_private->regions[++proc_private->regions_num - 1];
+
+	if (r == NULL) {
+		r = rte_zmalloc("region", sizeof(struct memif_region), 0);
+		if (r == NULL) {
+			MIF_LOG(ERR, "Failed to alloc memif region.");
+			return -ENOMEM;
+		}
+
+		r->addr = msl->base_va;
+		r->region_size = ms->len;
+		r->fd = rte_memseg_get_fd(ms);
+		if (r->fd < 0)
+			return -1;
+		r->pkt_buffer_offset = 0;
+
+		proc_private->regions[proc_private->regions_num - 1] = r;
+	} else {
+		r->region_size += ms->len;
+	}
+
+	return 0;
+}
+
 static int
 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 {
@@ -591,12 +934,29 @@ memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 static int
 memif_regions_init(struct rte_eth_dev *dev)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int ret;

-	/* create one buffer region */
-	ret = memif_region_init_shm(dev, /* has buffer */ 1);
-	if (ret < 0)
-		return ret;
+	/*
+	 * Zero-copy exposes dpdk memory.
+	 * Each memseg list will be represented by memif region.
+	 * Zero-copy regions indexing: memseg list idx + 1,
+	 * as we already have region 0 reserved for descriptors.
+	 */
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		/* create region idx 0 containing descriptors */
+		ret = memif_region_init_shm(dev, 0);
+		if (ret < 0)
+			return ret;
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
+		if (ret < 0)
+			return ret;
+	} else {
+		/* create one memory region containing rings and buffers */
+		ret = memif_region_init_shm(dev, /* has buffers */ 1);
+		if (ret < 0)
+			return ret;
+	}

 	return 0;
 }
@@ -616,6 +976,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = i * (1 << pmd->run.log2_ring_size) + j;
 			ring->desc[j].region = 0;
@@ -632,6 +996,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = (i + pmd->run.num_s2m_rings) *
 			    (1 << pmd->run.log2_ring_size) + j;
@@ -645,7 +1013,7 @@ memif_init_rings(struct rte_eth_dev *dev)
 }

 /* called only by slave */
-static void
+static int
 memif_init_queues(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
@@ -666,6 +1034,13 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for tx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}

 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
@@ -682,7 +1057,15 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for rx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}
+	return 0;
 }

 int
@@ -696,7 +1079,9 @@ memif_init_regions_and_queues(struct rte_eth_dev *dev)

 	memif_init_rings(dev);

-	memif_init_queues(dev);
+	ret = memif_init_queues(dev);
+	if (ret < 0)
+		return ret;

 	return 0;
 }
@@ -720,8 +1105,16 @@ memif_connect(struct rte_eth_dev *dev)
 				mr->addr = mmap(NULL, mr->region_size,
 						PROT_READ | PROT_WRITE,
 						MAP_SHARED, mr->fd, 0);
-				if (mr->addr == NULL)
+				if (mr->addr == MAP_FAILED) {
+					MIF_LOG(ERR, "mmap failed: %s\n",
+						strerror(errno));
 					return -1;
+				}
+			}
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				/* close memseg file */
+				close(mr->fd);
+				mr->fd = -1;
 			}
 		}
 	}
@@ -782,8 +1175,7 @@ memif_dev_start(struct rte_eth_dev *dev)
 		ret = memif_connect_master(dev);
 		break;
 	default:
-		MIF_LOG(ERR, "%s: Unknown role: %d.",
-			rte_vdev_device_name(pmd->vdev), pmd->role);
+		MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
 		ret = -1;
 		break;
 	}
@@ -848,8 +1240,7 @@ memif_tx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate tx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -878,8 +1269,7 @@ memif_rx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate rx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -920,7 +1310,7 @@ memif_link_update(struct rte_eth_dev *dev,
 			memif_mp_request_regions(dev);
 		} else if (dev->data->dev_link.link_status == ETH_LINK_DOWN &&
 				proc_private->regions_num > 0) {
-			memif_free_regions(proc_private);
+			memif_free_regions(dev);
 		}
 	}
 	return 0;
@@ -964,6 +1354,7 @@ memif_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 		mq = dev->data->tx_queues[i];
 		stats->q_opackets[i] = mq->n_pkts;
 		stats->q_obytes[i] = mq->n_bytes;
+		stats->q_errors[i] = mq->n_err;
 		stats->opackets += mq->n_pkts;
 		stats->obytes += mq->n_bytes;
 		stats->oerrors += mq->n_err;
@@ -1043,11 +1434,6 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	const unsigned int numa_node = vdev->device.numa_node;
 	const char *name = rte_vdev_device_name(vdev);

-	if (flags & ETH_MEMIF_FLAG_ZERO_COPY) {
-		MIF_LOG(ERR, "Zero-copy slave not supported.");
-		return -1;
-	}
-
 	eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
 	if (eth_dev == NULL) {
 		MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
@@ -1071,6 +1457,9 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	pmd->flags = flags;
 	pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
 	pmd->role = role;
+	/* Zero-copy flag irrelevant to master. */
+	if (pmd->role == MEMIF_ROLE_MASTER)
+		pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;

 	ret = memif_socket_init(eth_dev, socket_filename);
 	if (ret < 0)
@@ -1094,8 +1483,14 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,

 	eth_dev->dev_ops = &ops;
 	eth_dev->device = &vdev->device;
-	eth_dev->rx_pkt_burst = eth_memif_rx;
-	eth_dev->tx_pkt_burst = eth_memif_tx;
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		eth_dev->rx_pkt_burst = eth_memif_rx_zc;
+		eth_dev->tx_pkt_burst = eth_memif_tx_zc;
+	} else {
+		eth_dev->rx_pkt_burst = eth_memif_rx;
+		eth_dev->tx_pkt_burst = eth_memif_tx;
+	}
+

 	eth_dev->data->dev_flags &= RTE_ETH_DEV_CLOSE_REMOVE;

@@ -1127,6 +1522,10 @@ memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
 	uint32_t *flags = (uint32_t *)extra_args;

 	if (strstr(value, "yes") != NULL) {
+		if (!rte_mcfg_get_single_file_segments()) {
+			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
+			return -ENOTSUP;
+		}
 		*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
 	} else if (strstr(value, "no") != NULL) {
 		*flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index 24e8a0914..19e521819 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -63,13 +63,16 @@ struct memif_queue {
 	uint16_t last_head;			/**< last ring head */
 	uint16_t last_tail;			/**< last ring tail */

+	struct rte_mbuf **buffers;
+	/**< Stored mbufs. Used in zero-copy tx. Slave stores transmitted
+	 * mbufs to free them once master has received them.
+	 */
+
 	/* rx/tx info */
 	uint64_t n_pkts;			/**< number of rx/tx packets */
 	uint64_t n_bytes;			/**< number of rx/tx bytes */
 	uint64_t n_err;				/**< number of tx errors */

-	memif_ring_t *ring;			/**< pointer to ring */
-
 	struct rte_intr_handle intr_handle;	/**< interrupt handle */

 	memif_log2_ring_size_t log2_ring_size;	/**< log2 of ring size */
@@ -132,7 +135,7 @@ struct pmd_process_private {
  * @param proc_private
  *   device process private data
  */
-void memif_free_regions(struct pmd_process_private *proc_private);
+void memif_free_regions(struct rte_eth_dev *dev);

 /**
  * Finalize connection establishment process. Map shared memory file
diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
index 066549432..03d9d472d 100644
--- a/lib/librte_eal/common/eal_common_mcfg.c
+++ b/lib/librte_eal/common/eal_common_mcfg.c
@@ -161,3 +161,10 @@ rte_mcfg_timer_unlock(void)
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	rte_spinlock_unlock(&mcfg->tlock);
 }
+
+uint32_t
+rte_mcfg_get_single_file_segments(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	return mcfg->single_file_segments;
+}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 34b0e44a0..9bb4a57f8 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -109,6 +109,16 @@ __rte_experimental
 void
 rte_mcfg_timer_unlock(void);

+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get the single_file_segments parameter value from memory configuration.
+ */
+__rte_experimental
+uint32_t
+rte_mcfg_get_single_file_segments(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index d0ee67dbf..3b8f81d53 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -407,4 +407,5 @@ EXPERIMENTAL {
 	rte_lcore_to_cpu_id;
 	rte_mcfg_timer_lock;
 	rte_mcfg_timer_unlock;
+	rte_mcfg_get_single_file_segments;
 };
--
2.17.1


* Re: [dpdk-dev] [PATCH v4] net/memif: zero-copy slave
  2019-07-09  8:22 [dpdk-dev] [PATCH v4] net/memif: zero-copy slave Jakub Grajciar
@ 2019-07-10 15:06 ` Ferruh Yigit
  2019-07-23 12:35   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
  1 sibling, 1 reply; 20+ messages in thread
From: Ferruh Yigit @ 2019-07-10 15:06 UTC (permalink / raw)
  To: Jakub Grajciar, dev; +Cc: anatoly.burakov

On 7/9/2019 9:22 AM, Jakub Grajciar wrote:
> Zero-copy slave support for memif PMD.
> Slave interface exposes DPDK memory to
> master interface. Only single file segments
> are supported (EAL option --single-file-segments).
> 
> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>

<...>

> +Zero-copy slave
> +~~~~~~~~~~~~~~~
> +
> +**Shared memory format**
> +
> +Region 0 is created by memif driver and contains rings. Slave interface exposes DPDK memory (memseg).
> +Instead of using memfd_create() to create new shared file, existing memsegs are used.
> +Master interface functions the same as with zero-copy disabled.
> +
> +region 0:
> +
> ++-----------------------+
> +| Rings                 |
> ++-----------+-----------+
> +| S2M rings | M2S rings |
> ++-----------+-----------+
> +
> +region n:
> +
> ++-----------------+
> +| Buffers         |
> ++-----------------+
> +|memseg           |
> ++-----------------+
> +
> +Buffers are dequeued and enqueued as needed. Offset descriptor field is calculated at tx.
> +Only single file segments mode (EAL option --single-file-segments) is supported, as calculating
> +offset from multiple segments is too expensive.
> +

Can you please add a sample testpmd command for 'zero-copy' support? This can be
useful to highlight that the '--single-file-segments' EAL argument is required.

Also can you please document what happens if only master or slave provides the
'zero-copy=yes' devarg?

<...>

> @@ -1127,6 +1522,10 @@ memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
>  	uint32_t *flags = (uint32_t *)extra_args;
> 
>  	if (strstr(value, "yes") != NULL) {
> +		if (!rte_mcfg_get_single_file_segments()) {
> +			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
> +			return -ENOTSUP;
> +		}

Why "--single-file-segments" is required? Is it to have big physically
continuous chunk of memory? If so will it work with 2Mb hugepages?
btw, I am having a crash with 2Mb hugepages and zerocopy, but "1Gb & zerocopy" &
"2Mb & non zerocopy" works without crash.



* Re: [dpdk-dev] [PATCH v4] net/memif: zero-copy slave
  2019-07-10 15:06 ` Ferruh Yigit
@ 2019-07-23 12:35   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  0 siblings, 0 replies; 20+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco) @ 2019-07-23 12:35 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: anatoly.burakov



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Wednesday, July 10, 2019 5:07 PM
> To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
> <jgrajcia@cisco.com>; dev@dpdk.org
> Cc: anatoly.burakov@intel.com
> Subject: Re: [dpdk-dev] [PATCH v4] net/memif: zero-copy slave
> 
> On 7/9/2019 9:22 AM, Jakub Grajciar wrote:
> > Zero-copy slave support for memif PMD.
> > Slave interface exposes DPDK memory to master interface. Only single
> > file segments are supported (EAL option --single-file-segments).
> >
> > Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
> 
> <...>
> 
> > +Zero-copy slave
> > +~~~~~~~~~~~~~~~
> > +
> > +**Shared memory format**
> > +
> > +Region 0 is created by memif driver and contains rings. Slave interface
> exposes DPDK memory (memseg).
> > +Instead of using memfd_create() to create new shared file, existing
> memsegs are used.
> > +Master interface functions the same as with zero-copy disabled.
> > +
> > +region 0:
> > +
> > ++-----------------------+
> > +| Rings                 |
> > ++-----------+-----------+
> > +| S2M rings | M2S rings |
> > ++-----------+-----------+
> > +
> > +region n:
> > +
> > ++-----------------+
> > +| Buffers         |
> > ++-----------------+
> > +|memseg           |
> > ++-----------------+
> > +
> > +Buffers are dequeued and enqueued as needed. Offset descriptor field is
> calculated at tx.
> > +Only single file segments mode (EAL option --single-file-segments) is
> > +supported, as calculating offset from multiple segments is too expensive.
> > +
> 
> Can you please add sample testpmd command for 'zerocopy' support, this can
> be useful to highlight '--single-file-segments' eal argument is required.
> 
> Also can you please document what happens if only master or slave provides
> the 'zero-copy=yes' devarg?
> 

Will do.

> <...>
> 
> > @@ -1127,6 +1522,10 @@ memif_set_zc(const char *key __rte_unused,
> const char *value, void *extra_args)
> >  	uint32_t *flags = (uint32_t *)extra_args;
> >
> >  	if (strstr(value, "yes") != NULL) {
> > +		if (!rte_mcfg_get_single_file_segments()) {
> > +			MIF_LOG(ERR, "Zero-copy doesn't support multi-file
> segments.");
> > +			return -ENOTSUP;
> > +		}
> 
> Why "--single-file-segments" is required? Is it to have big physically
> continuous chunk of memory? If so will it work with 2Mb hugepages?

The single file segments limitation will be explained in the docs in the next patch. Simply put, zero-copy with multi-file segments has worse performance than non-zero-copy due to the slow memseg lookup that would be needed for each packet buffer.
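
For illustration, this is roughly the per-buffer work done by the zero-copy
tx path (memif_tx_one_zc() in the patch); fill_zc_desc() is only a
hypothetical wrapper around the same pointer arithmetic:

#include <rte_mbuf.h>

#include "memif.h"   /* memif_desc_t, descriptor definitions */

/* With --single-file-segments the whole memseg list is one mapped file,
 * so the descriptor offset is a single pointer subtraction against the
 * region base. With multiple segment files, each packet would first need
 * a memseg lookup to find which file it belongs to.
 */
static void
fill_zc_desc(memif_desc_t *d0, struct rte_mbuf *mbuf, void *region_base)
{
	d0->region = 1;	/* region 0 holds rings, region 1 the exposed memsegs */
	d0->length = rte_pktmbuf_data_len(mbuf);
	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) - (uint8_t *)region_base;
	d0->flags = 0;
}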

> btw, I am having a crash with 2Mb hugepages and zerocopy, but "1Gb &
> zerocopy" & "2Mb & non zerocopy" works without crash.

This is strange; I've only tested using 2MB hugepages, without a crash. Can you please provide more details?



* [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-07-09  8:22 [dpdk-dev] [PATCH v4] net/memif: zero-copy slave Jakub Grajciar
  2019-07-10 15:06 ` Ferruh Yigit
@ 2019-08-22  8:18 ` Jakub Grajciar
  2019-10-04 13:23   ` Ferruh Yigit
                     ` (2 more replies)
  1 sibling, 3 replies; 20+ messages in thread
From: Jakub Grajciar @ 2019-08-22  8:18 UTC (permalink / raw)
  To: dev; +Cc: Jakub Grajciar

Zero-copy slave support for memif PMD.
Slave interface exposes DPDK memory to
master interface. Only single file segments
are supported (EAL option --single-file-segments).

Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
---
 doc/guides/nics/memif.rst                     |  42 +-
 drivers/net/memif/Makefile                    |   1 +
 drivers/net/memif/memif_socket.c              |  64 +--
 drivers/net/memif/meson.build                 |   1 +
 drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
 drivers/net/memif/rte_eth_memif.h             |  11 +-
 lib/librte_eal/common/eal_common_mcfg.c       |   7 +
 .../common/include/rte_eal_memconfig.h        |  10 +
 lib/librte_eal/rte_eal_version.map            |   1 +
 9 files changed, 513 insertions(+), 73 deletions(-)

V2:
- fix coding style

V3:
- fix compilation issues

V4:
- don't move existing code
- add new EAL API rte_mcfg_get_single_file_segments();
  mem_config is now private, so this API returns the
  single_file_segments parameter value

V5:
- explain single file segments limitation
- add zero-copy slave example

diff --git a/doc/guides/nics/memif.rst b/doc/guides/nics/memif.rst
index de2d481eb..31a10848c 100644
--- a/doc/guides/nics/memif.rst
+++ b/doc/guides/nics/memif.rst
@@ -45,7 +45,7 @@ client.
    "socket=/tmp/memif.sock", "Socket filename", "/tmp/memif.sock", "string len 256"
    "mac=01:23:45:ab:cd:ef", "Mac address", "01:ab:23:cd:45:ef", ""
    "secret=abc123", "Secret is an optional security option, which if specified, must be matched by peer", "", "string len 24"
-   "zero-copy=yes", "Enable/disable zero-copy slave mode", "no", "yes|no"
+   "zero-copy=yes", "Enable/disable zero-copy slave mode. Only relevant to the slave; requires the '--single-file-segments' EAL argument", "no", "yes|no"

 **Connection establishment**

@@ -171,6 +171,42 @@ Files
 - net/memif/memif.h *- descriptor and ring definitions*
 - net/memif/rte_eth_memif.c *- eth_memif_rx() eth_memif_tx()*

+Zero-copy slave
+~~~~~~~~~~~~~~~
+
+Zero-copy slave can be enabled with the memif configuration option 'zero-copy=yes'. This option
+is only relevant to the slave and requires the EAL argument '--single-file-segments'.
+This limitation is in place because it is too expensive to identify the memseg
+for each packet buffer, resulting in worse performance than with zero-copy disabled.
+With single file segments we can calculate the offset from the beginning of the file
+for each packet buffer.
+
+**Shared memory format**
+
+Region 0 is created by memif driver and contains rings. Slave interface exposes DPDK memory (memseg).
+Instead of using memfd_create() to create new shared file, existing memsegs are used.
+Master interface functions the same as with zero-copy disabled.
+
+region 0:
+
++-----------------------+
+| Rings                 |
++-----------+-----------+
+| S2M rings | M2S rings |
++-----------+-----------+
+
+region n:
+
++-----------------+
+| Buffers         |
++-----------------+
+|memseg           |
++-----------------+
+
+Buffers are dequeued and enqueued as needed. Offset descriptor field is calculated at tx.
+Only single file segments mode (EAL option --single-file-segments) is supported, as calculating
+offset from multiple segments is too expensive.
+
 Example: testpmd
 ----------------------------
 In this example we run two instances of testpmd application and transmit packets over memif.
@@ -183,6 +219,10 @@ Now create ``slave`` interface (master must be already running so the slave will

     #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif -- -i

+You can also enable ``zero-copy`` on ``slave`` interface::
+
+    #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif,zero-copy=yes --single-file-segments -- -i
+
 Start forwarding packets::

     Slave:
diff --git a/drivers/net/memif/Makefile b/drivers/net/memif/Makefile
index 3d92b08f2..b50fc1cdb 100644
--- a/drivers/net/memif/Makefile
+++ b/drivers/net/memif/Makefile
@@ -20,6 +20,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool
 LDLIBS += -lrte_ethdev -lrte_kvargs -lrte_net
 LDLIBS += -lrte_hash
diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
index 6740e3471..4710e05bd 100644
--- a/drivers/net/memif/memif_socket.c
+++ b/drivers/net/memif/memif_socket.c
@@ -176,8 +176,7 @@ memif_msg_receive_hello(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_name, (char *)h->name, sizeof(pmd->remote_name));

-	MIF_LOG(DEBUG, "%s: Connecting to %s.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_name);
+	MIF_LOG(DEBUG, "Connecting to %s.", pmd->remote_name);

 	return 0;
 }
@@ -339,8 +338,7 @@ memif_msg_receive_connect(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -358,8 +356,7 @@ memif_msg_receive_connected(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -370,14 +367,13 @@ memif_msg_receive_disconnect(struct rte_eth_dev *dev, memif_msg_t *msg)
 	struct pmd_internals *pmd = dev->data->dev_private;
 	memif_msg_disconnect_t *d = &msg->disconnect;

-	memset(pmd->remote_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->remote_disc_string, 0, sizeof(pmd->remote_disc_string));
 	strlcpy(pmd->remote_disc_string, (char *)d->string,
 		sizeof(pmd->remote_disc_string));

-	MIF_LOG(INFO, "%s: Disconnect received: %s",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_disc_string);
+	MIF_LOG(INFO, "Disconnect received: %s", pmd->remote_disc_string);

-	memset(pmd->local_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->local_disc_string, 0, 96);
 	memif_disconnect(dev);
 	return 0;
 }
@@ -473,7 +469,6 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connect_t *c;

 	if (e == NULL)
@@ -481,7 +476,7 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)

 	c = &e->msg.connect;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECT;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -491,7 +486,6 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connected_t *c;

 	if (e == NULL)
@@ -499,7 +493,7 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)

 	c = &e->msg.connected;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECTED;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -525,7 +519,6 @@ void
 memif_disconnect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
-	struct pmd_process_private *proc_private = dev->process_private;
 	struct memif_msg_queue_elt *elt, *next;
 	struct memif_queue *mq;
 	struct rte_intr_handle *ih;
@@ -615,7 +608,7 @@ memif_disconnect(struct rte_eth_dev *dev)
 		}
 	}

-	memif_free_regions(proc_private);
+	memif_free_regions(dev);

 	/* reset connection configuration */
 	memset(&pmd->run, 0, sizeof(pmd->run));
@@ -662,7 +655,7 @@ memif_msg_receive(struct memif_control_channel *cc)
 			if (cmsg->cmsg_type == SCM_CREDENTIALS)
 				cr = (struct ucred *)CMSG_DATA(cmsg);
 			else if (cmsg->cmsg_type == SCM_RIGHTS)
-				memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
+				rte_memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
 		}
 		cmsg = CMSG_NXTHDR(&mh, cmsg);
 	}
@@ -861,7 +854,7 @@ memif_listener_handler(void *arg)
 }

 static struct memif_socket *
-memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
+memif_socket_create(char *key, uint8_t listener)
 {
 	struct memif_socket *sock;
 	struct sockaddr_un un;
@@ -899,17 +892,15 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 		if (ret < 0)
 			goto error;

-		MIF_LOG(DEBUG, "%s: Memif listener socket %s created.",
-			rte_vdev_device_name(pmd->vdev), sock->filename);
+		MIF_LOG(DEBUG, "Memif listener socket %s created.", sock->filename);

 		sock->intr_handle.fd = sockfd;
 		sock->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		ret = rte_intr_callback_register(&sock->intr_handle,
 						 memif_listener_handler, sock);
 		if (ret < 0) {
-			MIF_LOG(ERR, "%s: Failed to register interrupt "
-				"callback for listener socket",
-				rte_vdev_device_name(pmd->vdev));
+			MIF_LOG(ERR, "Failed to register interrupt "
+				"callback for listener socket");
 			return NULL;
 		}
 	}
@@ -917,9 +908,7 @@ memif_socket_create(struct pmd_internals *pmd, char *key, uint8_t listener)
 	return sock;

  error:
-	MIF_LOG(ERR, "%s: Failed to setup socket %s: %s",
-		rte_vdev_device_name(pmd->vdev) ?
-		rte_vdev_device_name(pmd->vdev) : "NULL", key, strerror(errno));
+	MIF_LOG(ERR, "Failed to setup socket %s: %s", key, strerror(errno));
 	if (sock != NULL)
 		rte_free(sock);
 	if (sockfd >= 0)
@@ -963,9 +952,8 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)
 	rte_memcpy(key, socket_filename, strlen(socket_filename));
 	ret = rte_hash_lookup_data(hash, key, (void **)&socket);
 	if (ret < 0) {
-		socket = memif_socket_create(pmd, key,
-					     (pmd->role ==
-					      MEMIF_ROLE_SLAVE) ? 0 : 1);
+		socket = memif_socket_create(key,
+					     (pmd->role == MEMIF_ROLE_SLAVE) ? 0 : 1);
 		if (socket == NULL)
 			return -1;
 		ret = rte_hash_add_key_data(hash, key, socket);
@@ -996,8 +984,7 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)

 	elt = rte_malloc("pmd-queue", sizeof(struct memif_socket_dev_list_elt), 0);
 	if (elt == NULL) {
-		MIF_LOG(ERR, "%s: Failed to add device to socket device list.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to add device to socket device list.");
 		return -1;
 	}
 	elt->dev = dev;
@@ -1075,8 +1062,7 @@ memif_connect_slave(struct rte_eth_dev *dev)

 	sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
 	if (sockfd < 0) {
-		MIF_LOG(ERR, "%s: Failed to open socket.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to open socket.");
 		return -1;
 	}

@@ -1087,19 +1073,16 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = connect(sockfd, (struct sockaddr *)&sun,
 		      sizeof(struct sockaddr_un));
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to connect socket: %s.",
-			rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+		MIF_LOG(ERR, "Failed to connect socket: %s.", pmd->socket_filename);
 		goto error;
 	}

-	MIF_LOG(DEBUG, "%s: Memif socket: %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+	MIF_LOG(DEBUG, "Memif socket: %s connected.", pmd->socket_filename);

 	pmd->cc = rte_zmalloc("memif-cc",
 			      sizeof(struct memif_control_channel), 0);
 	if (pmd->cc == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate control channel.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to allocate control channel.");
 		goto error;
 	}

@@ -1112,8 +1095,7 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = rte_intr_callback_register(&pmd->cc->intr_handle,
 					 memif_intr_handler, pmd->cc);
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to register interrupt callback "
-			"for control fd", rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to register interrupt callback for control fd");
 		goto error;
 	}

diff --git a/drivers/net/memif/meson.build b/drivers/net/memif/meson.build
index bedc97311..6b62255f9 100644
--- a/drivers/net/memif/meson.build
+++ b/drivers/net/memif/meson.build
@@ -14,5 +14,6 @@ allow_experimental_apis = true
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments

 deps += ['hash']
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index a59f80967..47c6ea0f3 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -23,6 +23,10 @@
 #include <rte_kvargs.h>
 #include <rte_bus_vdev.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal_memconfig.h>

 #include "rte_eth_memif.h"
 #include "memif_socket.h"
@@ -50,6 +54,10 @@ static const char * const valid_arguments[] = {

 #define MEMIF_MP_SEND_REGION		"memif_mp_send_region"

+
+static int memif_region_init_zc(const struct rte_memseg_list *msl,
+				const struct rte_memseg *ms, void *arg);
+
 const char *
 memif_version(void)
 {
@@ -116,10 +124,14 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 	struct mp_region_msg *reply_param;
 	struct memif_region *r;
 	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
+	/* in case of zero-copy slave, only request region 0 */
+	uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
+				   1 : ETH_MEMIF_MAX_REGION_NUM;

 	MIF_LOG(DEBUG, "Requesting memory regions");

-	for (i = 0; i < ETH_MEMIF_MAX_REGION_NUM; i++) {
+	for (i = 0; i < max_region_num; i++) {
 		/* Prepare the message */
 		memset(&msg, 0, sizeof(msg));
 		strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
@@ -161,6 +173,12 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 		free(reply);
 	}

+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
+		if (ret < 0)
+			return ret;
+	}
+
 	return memif_connect(dev);
 }

@@ -218,6 +236,23 @@ memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
 	return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
 }

+/* Free mbufs received by master */
+static void
+memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
+{
+	uint16_t mask = (1 << mq->log2_ring_size) - 1;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+
+	/* FIXME: improve performance */
+	while (mq->last_tail != ring->tail) {
+		RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
+		/* Decrement refcnt and free mbuf. (current segment) */
+		rte_mbuf_refcnt_update(mq->buffers[mq->last_tail & mask], -1);
+		rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
+		mq->last_tail++;
+	}
+}
+
 static int
 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
 		    struct rte_mbuf *tail)
@@ -323,8 +358,8 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;

 			memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, dst_off),
-			       (uint8_t *)memif_get_buffer(proc_private, d0) +
-			       src_off, cp_len);
+			       (uint8_t *)memif_get_buffer(proc_private, d0) + src_off,
+			       cp_len);

 			src_off += cp_len;
 			dst_off += cp_len;
@@ -369,6 +404,120 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_rx_pkts;
 }

+static uint16_t
+eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
+	uint16_t n_rx_pkts = 0;
+	memif_desc_t *d0;
+	struct rte_mbuf *mbuf, *mbuf_tail;
+	struct rte_mbuf *mbuf_head = NULL;
+	int ret;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	/* consume interrupt */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t b;
+		ssize_t size __rte_unused;
+		size = read(mq->intr_handle.fd, &b, sizeof(b));
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	cur_slot = mq->last_tail;
+	last_slot = ring->tail;
+	if (cur_slot == last_slot)
+		goto refill;
+	n_slots = last_slot - cur_slot;
+
+	while (n_slots && n_rx_pkts < nb_pkts) {
+		s0 = cur_slot & mask;
+
+		d0 = &ring->desc[s0];
+		mbuf_head = mq->buffers[s0];
+		mbuf = mbuf_head;
+
+next_slot:
+		/* prefetch next descriptor */
+		if (n_rx_pkts + 1 < nb_pkts)
+			rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
+
+		mbuf->port = mq->in_port;
+		rte_pktmbuf_data_len(mbuf) = d0->length;
+		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+
+		mq->n_bytes += rte_pktmbuf_data_len(mbuf);
+
+		cur_slot++;
+		n_slots--;
+		if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
+			mbuf_tail = mbuf;
+			mbuf = mq->buffers[s0];
+			ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+			if (unlikely(ret < 0)) {
+				MIF_LOG(ERR, "number-of-segments-overflow");
+				goto refill;
+			}
+			goto next_slot;
+		}
+
+		*bufs++ = mbuf_head;
+		n_rx_pkts++;
+	}
+
+	mq->last_tail = cur_slot;
+
+/* Supply master with new buffers */
+refill:
+	head = ring->head;
+	n_slots = ring_size - head + mq->last_tail;
+
+	if (n_slots < 32)
+		goto no_free_mbufs;
+
+	ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
+	if (unlikely(ret < 0))
+		goto no_free_mbufs;
+
+	while (n_slots--) {
+		s0 = head++ & mask;
+		if (n_slots > 0)
+			rte_prefetch0(mq->buffers[head & mask]);
+		d0 = &ring->desc[s0];
+		/* store buffer header */
+		mbuf = mq->buffers[s0];
+		/* populate descriptor */
+		d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
+				RTE_PKTMBUF_HEADROOM;
+		d0->region = 1;
+		d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+			(uint8_t *)proc_private->regions[d0->region]->addr;
+	}
+no_free_mbufs:
+	rte_mb();
+	ring->head = head;
+
+	mq->n_pkts += n_rx_pkts;
+
+	return n_rx_pkts;
+}
+
 static uint16_t
 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -483,19 +632,173 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }

+
+static int
+memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
+		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
+		uint16_t slot, uint16_t n_free)
+{
+	memif_desc_t *d0;
+	int used_slots = 1;
+
+next_in_chain:
+	/* store pointer to mbuf to free it later */
+	mq->buffers[slot & mask] = mbuf;
+	/* Increment refcnt to make sure the buffer is not freed before master
+	 * receives it. (current segment)
+	 */
+	rte_mbuf_refcnt_update(mbuf, 1);
+	/* populate descriptor */
+	d0 = &ring->desc[slot & mask];
+	d0->length = rte_pktmbuf_data_len(mbuf);
+	/* FIXME: get region index */
+	d0->region = 1;
+	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+		(uint8_t *)proc_private->regions[d0->region]->addr;
+	d0->flags = 0;
+
+	/* check if buffer is chained */
+	if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
+		if (n_free < 2)
+			return 0;
+		/* mark buffer as chained */
+		d0->flags |= MEMIF_DESC_FLAG_NEXT;
+		/* advance mbuf */
+		mbuf = mbuf->next;
+		/* update counters */
+		used_slots++;
+		slot++;
+		n_free--;
+		goto next_in_chain;
+	}
+	return used_slots;
+}
+
+static uint16_t
+eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
+	memif_ring_type_t type = mq->type;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	/* free mbufs received by master */
+	memif_free_stored_mbufs(proc_private, mq);
+
+	/* ring type always MEMIF_RING_S2M */
+	slot = ring->head;
+	n_free = ring_size - ring->head + mq->last_tail;
+
+	int used_slots;
+
+	while (n_free && (n_tx_pkts < nb_pkts)) {
+		while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
+			if ((nb_pkts - n_tx_pkts) > 8) {
+				rte_prefetch0(*bufs + 4);
+				rte_prefetch0(*bufs + 5);
+				rte_prefetch0(*bufs + 6);
+				rte_prefetch0(*bufs + 7);
+			}
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+		}
+		used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+			mask, slot, n_free);
+		if (unlikely(used_slots < 1))
+			goto no_free_slots;
+		n_tx_pkts++;
+		slot += used_slots;
+		n_free -= used_slots;
+	}
+
+no_free_slots:
+	rte_mb();
+	/* update ring pointers */
+	if (type == MEMIF_RING_S2M)
+		ring->head = slot;
+	else
+		ring->tail = slot;
+
+	/* Send interrupt, if enabled. */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t a = 1;
+		ssize_t size = write(mq->intr_handle.fd, &a, sizeof(a));
+		if (unlikely(size < 0)) {
+			MIF_LOG(WARNING,
+				"Failed to send interrupt. %s", strerror(errno));
+		}
+	}
+
+	/* increment queue counters */
+	mq->n_pkts += n_tx_pkts;
+
+	return n_tx_pkts;
+}
+
 void
-memif_free_regions(struct pmd_process_private *proc_private)
+memif_free_regions(struct rte_eth_dev *dev)
 {
+	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int i;
 	struct memif_region *r;

-	MIF_LOG(DEBUG, "Free memory regions");
 	/* regions are allocated contiguously, so it's
 	 * enough to loop until 'proc_private->regions_num'
 	 */
 	for (i = 0; i < proc_private->regions_num; i++) {
 		r = proc_private->regions[i];
 		if (r != NULL) {
+			/* This is memzone */
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				r->addr = NULL;
+				if (r->fd > 0)
+					close(r->fd);
+			}
 			if (r->addr != NULL) {
 				munmap(r->addr, r->region_size);
 				if (r->fd > 0) {
@@ -510,6 +813,45 @@ memif_free_regions(struct pmd_process_private *proc_private)
 	proc_private->regions_num = 0;
 }

+static int
+memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		     void *arg)
+{
+	struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
+	struct memif_region *r;
+
+	if (proc_private->regions_num < 1) {
+		MIF_LOG(ERR, "Missing descriptor region");
+		return -1;
+	}
+
+	r = proc_private->regions[proc_private->regions_num - 1];
+
+	if (r->addr != msl->base_va)
+		r = proc_private->regions[++proc_private->regions_num - 1];
+
+	if (r == NULL) {
+		r = rte_zmalloc("region", sizeof(struct memif_region), 0);
+		if (r == NULL) {
+			MIF_LOG(ERR, "Failed to alloc memif region.");
+			return -ENOMEM;
+		}
+
+		r->addr = msl->base_va;
+		r->region_size = ms->len;
+		r->fd = rte_memseg_get_fd(ms);
+		if (r->fd < 0)
+			return -1;
+		r->pkt_buffer_offset = 0;
+
+		proc_private->regions[proc_private->regions_num - 1] = r;
+	} else {
+		r->region_size += ms->len;
+	}
+
+	return 0;
+}
+
 static int
 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 {
@@ -590,12 +932,29 @@ memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 static int
 memif_regions_init(struct rte_eth_dev *dev)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int ret;

-	/* create one buffer region */
-	ret = memif_region_init_shm(dev, /* has buffer */ 1);
-	if (ret < 0)
-		return ret;
+	/*
+	 * Zero-copy exposes dpdk memory.
+	 * Each memseg list will be represented by memif region.
+	 * Zero-copy regions indexing: memseg list idx + 1,
+	 * as we already have region 0 reserved for descriptors.
+	 */
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		/* create region idx 0 containing descriptors */
+		ret = memif_region_init_shm(dev, 0);
+		if (ret < 0)
+			return ret;
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
+		if (ret < 0)
+			return ret;
+	} else {
+		/* create one memory region containing rings and buffers */
+		ret = memif_region_init_shm(dev, /* has buffers */ 1);
+		if (ret < 0)
+			return ret;
+	}

 	return 0;
 }
@@ -615,6 +974,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = i * (1 << pmd->run.log2_ring_size) + j;
 			ring->desc[j].region = 0;
@@ -631,6 +994,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		ring->tail = 0;
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = (i + pmd->run.num_s2m_rings) *
 			    (1 << pmd->run.log2_ring_size) + j;
@@ -644,7 +1011,7 @@ memif_init_rings(struct rte_eth_dev *dev)
 }

 /* called only by slave */
-static void
+static int
 memif_init_queues(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
@@ -665,6 +1032,13 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for tx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}

 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
@@ -681,7 +1055,15 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for rx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}
+	return 0;
 }

 int
@@ -695,7 +1077,9 @@ memif_init_regions_and_queues(struct rte_eth_dev *dev)

 	memif_init_rings(dev);

-	memif_init_queues(dev);
+	ret = memif_init_queues(dev);
+	if (ret < 0)
+		return ret;

 	return 0;
 }
@@ -719,8 +1103,16 @@ memif_connect(struct rte_eth_dev *dev)
 				mr->addr = mmap(NULL, mr->region_size,
 						PROT_READ | PROT_WRITE,
 						MAP_SHARED, mr->fd, 0);
-				if (mr->addr == NULL)
+				if (mr->addr == MAP_FAILED) {
+					MIF_LOG(ERR, "mmap failed: %s\n",
+						strerror(errno));
 					return -1;
+				}
+			}
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				/* close memseg file */
+				close(mr->fd);
+				mr->fd = -1;
 			}
 		}
 	}
@@ -781,8 +1173,7 @@ memif_dev_start(struct rte_eth_dev *dev)
 		ret = memif_connect_master(dev);
 		break;
 	default:
-		MIF_LOG(ERR, "%s: Unknown role: %d.",
-			rte_vdev_device_name(pmd->vdev), pmd->role);
+		MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
 		ret = -1;
 		break;
 	}
@@ -847,8 +1238,7 @@ memif_tx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate tx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -876,8 +1266,7 @@ memif_rx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate rx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -917,7 +1306,7 @@ memif_link_update(struct rte_eth_dev *dev,
 			memif_mp_request_regions(dev);
 		} else if (dev->data->dev_link.link_status == ETH_LINK_DOWN &&
 				proc_private->regions_num > 0) {
-			memif_free_regions(proc_private);
+			memif_free_regions(dev);
 		}
 	}
 	return 0;
@@ -1036,11 +1425,6 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	const unsigned int numa_node = vdev->device.numa_node;
 	const char *name = rte_vdev_device_name(vdev);

-	if (flags & ETH_MEMIF_FLAG_ZERO_COPY) {
-		MIF_LOG(ERR, "Zero-copy slave not supported.");
-		return -1;
-	}
-
 	eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
 	if (eth_dev == NULL) {
 		MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
@@ -1064,6 +1448,9 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	pmd->flags = flags;
 	pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
 	pmd->role = role;
+	/* Zero-copy flag irrelevant to master. */
+	if (pmd->role == MEMIF_ROLE_MASTER)
+		pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;

 	ret = memif_socket_init(eth_dev, socket_filename);
 	if (ret < 0)
@@ -1087,8 +1474,14 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,

 	eth_dev->dev_ops = &ops;
 	eth_dev->device = &vdev->device;
-	eth_dev->rx_pkt_burst = eth_memif_rx;
-	eth_dev->tx_pkt_burst = eth_memif_tx;
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		eth_dev->rx_pkt_burst = eth_memif_rx_zc;
+		eth_dev->tx_pkt_burst = eth_memif_tx_zc;
+	} else {
+		eth_dev->rx_pkt_burst = eth_memif_rx;
+		eth_dev->tx_pkt_burst = eth_memif_tx;
+	}
+

 	eth_dev->data->dev_flags &= RTE_ETH_DEV_CLOSE_REMOVE;

@@ -1120,6 +1513,10 @@ memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
 	uint32_t *flags = (uint32_t *)extra_args;

 	if (strstr(value, "yes") != NULL) {
+		if (!rte_mcfg_get_single_file_segments()) {
+			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
+			return -ENOTSUP;
+		}
 		*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
 	} else if (strstr(value, "no") != NULL) {
 		*flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index 8269212cb..0d2566392 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -63,12 +63,15 @@ struct memif_queue {
 	uint16_t last_head;			/**< last ring head */
 	uint16_t last_tail;			/**< last ring tail */

+	struct rte_mbuf **buffers;
+	/**< Stored mbufs. Used in zero-copy tx. Slave stores transmitted
+	 * mbufs to free them once master has received them.
+	 */
+
 	/* rx/tx info */
 	uint64_t n_pkts;			/**< number of rx/tx packets */
 	uint64_t n_bytes;			/**< number of rx/tx bytes */

-	memif_ring_t *ring;			/**< pointer to ring */
-
 	struct rte_intr_handle intr_handle;	/**< interrupt handle */

 	memif_log2_ring_size_t log2_ring_size;	/**< log2 of ring size */
@@ -115,8 +118,6 @@ struct pmd_internals {
 	/**< local disconnect reason */
 	char remote_disc_string[ETH_MEMIF_DISC_STRING_SIZE];
 	/**< remote disconnect reason */
-
-	struct rte_vdev_device *vdev;		/**< vdev handle */
 };

 struct pmd_process_private {
@@ -131,7 +132,7 @@ struct pmd_process_private {
  * @param proc_private
  *   device process private data
  */
-void memif_free_regions(struct pmd_process_private *proc_private);
+void memif_free_regions(struct rte_eth_dev *dev);

 /**
  * Finalize connection establishment process. Map shared memory file
diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
index 066549432..03d9d472d 100644
--- a/lib/librte_eal/common/eal_common_mcfg.c
+++ b/lib/librte_eal/common/eal_common_mcfg.c
@@ -161,3 +161,10 @@ rte_mcfg_timer_unlock(void)
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	rte_spinlock_unlock(&mcfg->tlock);
 }
+
+uint32_t
+rte_mcfg_get_single_file_segments(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	return mcfg->single_file_segments;
+}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 34b0e44a0..9bb4a57f8 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -109,6 +109,16 @@ __rte_experimental
 void
 rte_mcfg_timer_unlock(void);

+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get the single_file_segments parameter value from memory configuration.
+ */
+__rte_experimental
+uint32_t
+rte_mcfg_get_single_file_segments(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7cbf82d37..c2b9d473f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -418,5 +418,6 @@ EXPERIMENTAL {
 	rte_lcore_to_cpu_id;
 	rte_mcfg_timer_lock;
 	rte_mcfg_timer_unlock;
+	rte_mcfg_get_single_file_segments;
 	rte_rand_max;
 };
--
2.17.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
@ 2019-10-04 13:23   ` Ferruh Yigit
  2019-10-15 16:59     ` Ferruh Yigit
  2019-10-25 16:44   ` Yigit, Ferruh
  2019-11-04 11:03   ` [dpdk-dev] [PATCH v6] " Jakub Grajciar
  2 siblings, 1 reply; 20+ messages in thread
From: Ferruh Yigit @ 2019-10-04 13:23 UTC (permalink / raw)
  To: Jakub Grajciar, dev

On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
> Zero-copy slave support for memif PMD.
> Slave interface exposes DPDK memory to
> master interface. Only single file segments
> are supported (EAL option --single-file-segments).
> 
> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
> ---
>  doc/guides/nics/memif.rst                     |  42 +-
>  drivers/net/memif/Makefile                    |   1 +
>  drivers/net/memif/memif_socket.c              |  64 +--
>  drivers/net/memif/meson.build                 |   1 +
>  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
>  drivers/net/memif/rte_eth_memif.h             |  11 +-
>  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
>  .../common/include/rte_eal_memconfig.h        |  10 +
>  lib/librte_eal/rte_eal_version.map            |   1 +
>  9 files changed, 513 insertions(+), 73 deletions(-)
> 
> V2:
> - fix coding style
> 
> V3:
> - fix compilation issues
> 
> V4:
> - don't move existing code
> - add new EAL API rte_mcfg_get_single_file_segments,
>   mem_config is now private, this api returns
>   single_file_segments parameter value
> 
> V5:
> - explain single file segments limitation
> - add zero-copy slave example

Overall looks good, but I had to test this by manually modifying the PMD for the
bind() error.

I am for first fixing the PMD bind() issue before getting this patch, fyi.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-04 13:23   ` Ferruh Yigit
@ 2019-10-15 16:59     ` Ferruh Yigit
  2019-10-17 11:52       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  0 siblings, 1 reply; 20+ messages in thread
From: Ferruh Yigit @ 2019-10-15 16:59 UTC (permalink / raw)
  To: Jakub Grajciar, dev; +Cc: Stephen Hemminger

On 10/4/2019 2:23 PM, Ferruh Yigit wrote:
> On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
>> Zero-copy slave support for memif PMD.
>> Slave interface exposes DPDK memory to
>> master interface. Only single file segments
>> are supported (EAL option --single-file-segments).
>>
>> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
>> ---
>>  doc/guides/nics/memif.rst                     |  42 +-
>>  drivers/net/memif/Makefile                    |   1 +
>>  drivers/net/memif/memif_socket.c              |  64 +--
>>  drivers/net/memif/meson.build                 |   1 +
>>  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
>>  drivers/net/memif/rte_eth_memif.h             |  11 +-
>>  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
>>  .../common/include/rte_eal_memconfig.h        |  10 +
>>  lib/librte_eal/rte_eal_version.map            |   1 +
>>  9 files changed, 513 insertions(+), 73 deletions(-)
>>
>> V2:
>> - fix coding style
>>
>> V3:
>> - fix compilation issues
>>
>> V4:
>> - don't move existing code
>> - add new EAL API rte_mcfg_get_single_file_segments,
>>   mem_config is now private, this api returns
>>   single_file_segments parameter value
>>
>> V5:
>> - explain single file segments limitation
>> - add zero-copy slave example
> 
> Overall looks good, but I had to test this by manually modifying the PMD for the
> bind() error.
> 
> I am for first fixing the PMD bind() issue before getting this patch, fyi.
> 

Hi Jakub,

Just to double check whether anyone is looking into the bind() error issue, which has
been there since the following commit; I am waiting for more input on it.

Commit b923866c6974 ("net/memif: allow for full key size in socket name")
Cc: stephen@networkplumber.org


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-15 16:59     ` Ferruh Yigit
@ 2019-10-17 11:52       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-10-17 16:04         ` Ferruh Yigit
  0 siblings, 1 reply; 20+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco) @ 2019-10-17 11:52 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: Stephen Hemminger


> Hi Jakub,
> 
> Just to double check whether anyone is looking into the bind() error issue, which has
> been there since the following commit; I am waiting for more input on it.
> 
> Commit b923866c6974 ("net/memif: allow for full key size in socket name")
> Cc: stephen@networkplumber.org

Definitely an issue, I must have messed something up while applying the patch for testing, as it's not working for me either. I looked into bind() and it seems that the size has an upper limit. Are you able to revert the patch?

As for the issue that patch addresses, there is incorrect information in the docs that the 'socket' parameter length is 256 bytes. It should be 108 bytes, as that is the limitation on Linux.
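
For reference, a minimal sketch (illustration only, not part of the patch) of where the 108 comes from on Linux: sun_path in struct sockaddr_un is a fixed-size array, so bind() cannot accept a name longer than what fits there.

    #include <stdio.h>
    #include <sys/un.h>

    int main(void)
    {
            struct sockaddr_un un;

            /* On Linux sun_path is declared as char[108]; the socket path,
             * including the terminating NUL, has to fit in this buffer. */
            printf("sizeof(un.sun_path) = %zu\n", sizeof(un.sun_path));
            return 0;
    }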

Jakub

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-17 11:52       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
@ 2019-10-17 16:04         ` Ferruh Yigit
  2019-10-17 16:45           ` Ferruh Yigit
  0 siblings, 1 reply; 20+ messages in thread
From: Ferruh Yigit @ 2019-10-17 16:04 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco), dev
  Cc: Stephen Hemminger

On 10/17/2019 12:52 PM, Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at
Cisco) wrote:
> 
>> Hi Jakub,
>>
>> Just to double check whether anyone is looking into the bind() error issue, which has
>> been there since the following commit; I am waiting for more input on it.
>>
>> Commit b923866c6974 ("net/memif: allow for full key size in socket name")
>> Cc: stephen@networkplumber.org
> 
> Definitely an issue, I must have messed something up while applying the patch for testing, as it's not working for me either. I looked into bind() and it seems that the size has an upper limit. Are you able to revert the patch?

+1 to logically revert the patch, but actually I think it is easier to do an
incremental patch on top of current head to fix the issue.

The patch replaces the hard-coded 'key_len' with a macro, which is good to keep; also,
the number of lines that need to change for the fix looks smaller than the number of
lines a revert would change :)

> 
> As for the issue that patch addresses, there is incorrect information in the docs that the 'socket' parameter length is 256 bytes. It should be 108 bytes, as that is the limitation on Linux.

I don't know the doc but code has it as 256b [1] and the issue patch addresses
looks like a valid issue. If you are OK to limit the 'key_len' to 108 [2], the
fix is really easy.


[1]
http://lxr.dpdk.org/dpdk/v19.08/source/drivers/net/memif/memif_socket.h#L84

[2]
If I remember correctly the suggested length was 99 for portability; it seems
different OSes define this value differently and 99 is the smallest ...
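
As a belt-and-braces idea (just a sketch from me, not something agreed in this thread), the driver could also assert at build time that the chosen key length still fits into sun_path on the platform it is built for, e.g.:

    #include <sys/un.h>
    #include <rte_common.h>   /* RTE_BUILD_BUG_ON */

    #define MEMIF_SOCKET_KEY_LEN 100

    static inline void
    memif_socket_key_len_check(void)
    {
            /* Fail the build if the key length ever grows past sun_path
             * (108 bytes on Linux, smaller on some other OSes). */
            RTE_BUILD_BUG_ON(MEMIF_SOCKET_KEY_LEN >
                             sizeof(((struct sockaddr_un *)0)->sun_path));
    }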

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-17 16:04         ` Ferruh Yigit
@ 2019-10-17 16:45           ` Ferruh Yigit
  2019-10-21 16:07             ` Ferruh Yigit
  0 siblings, 1 reply; 20+ messages in thread
From: Ferruh Yigit @ 2019-10-17 16:45 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  Cc: dev, Stephen Hemminger

On 10/17/2019 5:04 PM, Ferruh Yigit wrote:
> On 10/17/2019 12:52 PM, Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at
> Cisco) wrote:
>>
>>> Hi Jakub,
>>>
>>> Just to double check whether anyone is looking into the bind() error issue, which has
>>> been there since the following commit; I am waiting for more input on it.
>>>
>>> Commit b923866c6974 ("net/memif: allow for full key size in socket name")
>>> Cc: stephen@networkplumber.org
>>
>> Definitely an issue, I must have messed something up while applying the patch for testing, as it's not working for me either. I looked into bind() and it seems that the size has an upper limit. Are you able to revert the patch?
> 
> +1 to logically revert the patch, but actually I think it is easier to do an
> incremental patch on top of current head to fix the issue.
> 
> The patch replaces the hard-coded 'key_len' with a macro, which is good to keep; also,
> the number of lines that need to change for the fix looks smaller than the number of
> lines a revert would change :)
> 
>>
>> As for the issue that patch addresses, there is incorrect information in the docs that the 'socket' parameter length is 256 bytes. It should be 108 bytes, as that is the limitation on Linux.
> 
> I don't know the doc but code has it as 256b [1] and the issue patch addresses
> looks like a valid issue. If you are OK to limit the 'key_len' to 108 [2], the
> fix is really easy.

Hi Jakub,

Something like following seems fixing the issue, can you please make a patch for
the fix?



 diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
 index 0c71f6c454..aa4f549f2c 100644
 --- a/drivers/net/memif/memif_socket.c
 +++ b/drivers/net/memif/memif_socket.c
 @@ -868,8 +868,7 @@ memif_socket_create(struct pmd_internals *pmd,
                     const char *key, uint8_t listener)
  {
         struct memif_socket *sock;
 -       struct sockaddr_un *un;
 -       char un_buf[MEMIF_SOCKET_UN_SIZE];
 +       struct sockaddr_un un;
         int sockfd;
         int ret;
         int on = 1;
 @@ -889,16 +888,14 @@ memif_socket_create(struct pmd_internals *pmd,
                 if (sockfd < 0)
                         goto error;

 -               memset(un_buf, 0, sizeof(un_buf));
 -               un = (struct sockaddr_un *)un_buf;
 -               un->sun_family = AF_UNIX;
 -               strlcpy(un->sun_path, sock->filename, MEMIF_SOCKET_KEY_LEN);
 +               un.sun_family = AF_UNIX;
 +               strlcpy(un.sun_path, sock->filename, MEMIF_SOCKET_KEY_LEN);

                 ret = setsockopt(sockfd, SOL_SOCKET, SO_PASSCRED, &on,
                                  sizeof(on));
                 if (ret < 0)
                         goto error;
 -               ret = bind(sockfd, (struct sockaddr *)un,  MEMIF_SOCKET_UN_SIZE);
 +               ret = bind(sockfd, (struct sockaddr *)&un, sizeof(struct
sockaddr_un));
                 if (ret < 0)
                         goto error;
                 ret = listen(sockfd, 1);
 diff --git a/drivers/net/memif/memif_socket.h b/drivers/net/memif /memif_socket.h
 index 9f40f8d138..12fa123230 100644
 --- a/drivers/net/memif/memif_socket.h
 +++ b/drivers/net/memif/memif_socket.h
 @@ -79,7 +79,7 @@ struct memif_socket_dev_list_elt {
  };

  #define MEMIF_SOCKET_HASH_NAME                 "memif-sh"
 -#define MEMIF_SOCKET_KEY_LEN           256
 +#define MEMIF_SOCKET_KEY_LEN           100

  struct memif_socket {
         struct rte_intr_handle intr_handle;     /**< interrupt handle */



> 
> 
> [1]
> http://lxr.dpdk.org/dpdk/v19.08/source/drivers/net/memif/memif_socket.h#L84
> 
> [2]
> If I remember correctly the suggested length was 99 for portability; it seems
> different OSes define this value differently and 99 is the smallest ...
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-17 16:45           ` Ferruh Yigit
@ 2019-10-21 16:07             ` Ferruh Yigit
  0 siblings, 0 replies; 20+ messages in thread
From: Ferruh Yigit @ 2019-10-21 16:07 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  Cc: dev, Stephen Hemminger, damarion

On 10/17/2019 5:45 PM, Ferruh Yigit wrote:
> On 10/17/2019 5:04 PM, Ferruh Yigit wrote:
>> On 10/17/2019 12:52 PM, Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at
>> Cisco) wrote:
>>>
>>>> Hi Jakub,
>>>>
>>>> Just to double check whether anyone is looking into the bind() error issue, which has
>>>> been there since the following commit; I am waiting for more input on it.
>>>>
>>>> Commit b923866c6974 ("net/memif: allow for full key size in socket name")
>>>> Cc: stephen@networkplumber.org
>>>
>>> Definitely an issue, I must have messed something up while applying the patch for testing, as it's not working for me either. I looked into bind() and it seems that the size has an upper limit. Are you able to revert the patch?
>>
>> +1 to logically revert the patch, but actually I think it is easier to do an
>> incremental patch on top of current head to fix the issue.
>>
>> The patch replaces the hard-coded 'key_len' with a macro, which is good to keep; also,
>> the number of lines that need to change for the fix looks smaller than the number of
>> lines a revert would change :)
>>
>>>
>>> As for the issue that patch addresses, there is incorrect information in the docs that the 'socket' parameter length is 256 bytes. It should be 108 bytes, as that is the limitation on Linux.
>>
>> I don't know the doc but code has it as 256b [1] and the issue patch addresses
>> looks like a valid issue. If you are OK to limit the 'key_len' to 108 [2], the
>> fix is really easy.
> 
> Hi Jakub,
> 
> Something like following seems fixing the issue, can you please make a patch for
> the fix?

Ping.

Is it possible to make this in time so that rc1 isn't released with a broken PMD?

Thanks,
ferruh

> 
> 
> 
>  diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
>  index 0c71f6c454..aa4f549f2c 100644
>  --- a/drivers/net/memif/memif_socket.c
>  +++ b/drivers/net/memif/memif_socket.c
>  @@ -868,8 +868,7 @@ memif_socket_create(struct pmd_internals *pmd,
>                      const char *key, uint8_t listener)
>   {
>          struct memif_socket *sock;
>  -       struct sockaddr_un *un;
>  -       char un_buf[MEMIF_SOCKET_UN_SIZE];
>  +       struct sockaddr_un un;
>          int sockfd;
>          int ret;
>          int on = 1;
>  @@ -889,16 +888,14 @@ memif_socket_create(struct pmd_internals *pmd,
>                  if (sockfd < 0)
>                          goto error;
> 
>  -               memset(un_buf, 0, sizeof(un_buf));
>  -               un = (struct sockaddr_un *)un_buf;
>  -               un->sun_family = AF_UNIX;
>  -               strlcpy(un->sun_path, sock->filename, MEMIF_SOCKET_KEY_LEN);
>  +               un.sun_family = AF_UNIX;
>  +               strlcpy(un.sun_path, sock->filename, MEMIF_SOCKET_KEY_LEN);
> 
>                  ret = setsockopt(sockfd, SOL_SOCKET, SO_PASSCRED, &on,
>                                   sizeof(on));
>                  if (ret < 0)
>                          goto error;
>  -               ret = bind(sockfd, (struct sockaddr *)un,  MEMIF_SOCKET_UN_SIZE);
>  +               ret = bind(sockfd, (struct sockaddr *)&un, sizeof(struct
> sockaddr_un));
>                  if (ret < 0)
>                          goto error;
>                  ret = listen(sockfd, 1);
>  diff --git a/drivers/net/memif/memif_socket.h b/drivers/net/memif /memif_socket.h
>  index 9f40f8d138..12fa123230 100644
>  --- a/drivers/net/memif/memif_socket.h
>  +++ b/drivers/net/memif/memif_socket.h
>  @@ -79,7 +79,7 @@ struct memif_socket_dev_list_elt {
>   };
> 
>   #define MEMIF_SOCKET_HASH_NAME                 "memif-sh"
>  -#define MEMIF_SOCKET_KEY_LEN           256
>  +#define MEMIF_SOCKET_KEY_LEN           100
> 
>   struct memif_socket {
>          struct rte_intr_handle intr_handle;     /**< interrupt handle */
> 
> 
> 
>>
>>
>> [1]
>> http://lxr.dpdk.org/dpdk/v19.08/source/drivers/net/memif/memif_socket.h#L84
>>
>> [2]
>> If I remember correctly the suggested length was 99 for portability; it seems
>> different OSes define this value differently and 99 is the smallest ...
>>
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
  2019-10-04 13:23   ` Ferruh Yigit
@ 2019-10-25 16:44   ` Yigit, Ferruh
  2019-10-29 14:28     ` David Marchand
  2019-11-04 11:03   ` [dpdk-dev] [PATCH v6] " Jakub Grajciar
  2 siblings, 1 reply; 20+ messages in thread
From: Yigit, Ferruh @ 2019-10-25 16:44 UTC (permalink / raw)
  To: Jakub Grajciar, dev, David Marchand

On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
> Zero-copy slave support for memif PMD.
> Slave interface exposes DPDK memory to
> master interface. Only single file segments
> are supported (EAL option --single-file-segments).
> 
> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>

Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

Since the bind() issue is solved, we can continue with the patch.

<...>

> @@ -131,7 +132,7 @@ struct pmd_process_private {
>   * @param proc_private
>   *   device process private data
>   */
> -void memif_free_regions(struct pmd_process_private *proc_private);
> +void memif_free_regions(struct rte_eth_dev *dev);
> 
>  /**
>   * Finalize connection establishment process. Map shared memory file
> diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
> index 066549432..03d9d472d 100644
> --- a/lib/librte_eal/common/eal_common_mcfg.c
> +++ b/lib/librte_eal/common/eal_common_mcfg.c
> @@ -161,3 +161,10 @@ rte_mcfg_timer_unlock(void)
>  	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
>  	rte_spinlock_unlock(&mcfg->tlock);
>  }
> +
> +uint32_t
> +rte_mcfg_get_single_file_segments(void)
> +{
> +	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> +	return mcfg->single_file_segments;
> +}
> diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
> index 34b0e44a0..9bb4a57f8 100644
> --- a/lib/librte_eal/common/include/rte_eal_memconfig.h
> +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
> @@ -109,6 +109,16 @@ __rte_experimental
>  void
>  rte_mcfg_timer_unlock(void);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Get the single_file_segments parameter value from memory configuration.
> + */
> +__rte_experimental
> +uint32_t
> +rte_mcfg_get_single_file_segments(void);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index 7cbf82d37..c2b9d473f 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -418,5 +418,6 @@ EXPERIMENTAL {
>  	rte_lcore_to_cpu_id;
>  	rte_mcfg_timer_lock;
>  	rte_mcfg_timer_unlock;
> +	rte_mcfg_get_single_file_segments;

This should be moved to 19.11 block in experimental

cc'ed Dave for eal part,
@Dave, change looks straight forward but can you please check/comment?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-25 16:44   ` Yigit, Ferruh
@ 2019-10-29 14:28     ` David Marchand
  2019-10-30 10:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
  0 siblings, 1 reply; 20+ messages in thread
From: David Marchand @ 2019-10-29 14:28 UTC (permalink / raw)
  To: Yigit, Ferruh, Jakub Grajciar; +Cc: dev, Burakov, Anatoly

On Fri, Oct 25, 2019 at 6:45 PM Yigit, Ferruh
<ferruh.yigit@linux.intel.com> wrote:
>
> On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
> > Zero-copy slave support for memif PMD.
> > Slave interface exposes DPDK memory to
> > master interface. Only single file segments
> > are supported (EAL option --single-file-segments).

Do you really want this additional configuration in your driver or
can't you enable/disable the functional
> >
> > Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
>
> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>
>
> Since the bind() issue is solved, we can continue with the patch.
>
> <...>
>
> > @@ -131,7 +132,7 @@ struct pmd_process_private {
> >   * @param proc_private
> >   *   device process private data
> >   */
> > -void memif_free_regions(struct pmd_process_private *proc_private);
> > +void memif_free_regions(struct rte_eth_dev *dev);
> >
> >  /**
> >   * Finalize connection establishment process. Map shared memory file
> > diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
> > index 066549432..03d9d472d 100644
> > --- a/lib/librte_eal/common/eal_common_mcfg.c
> > +++ b/lib/librte_eal/common/eal_common_mcfg.c
> > @@ -161,3 +161,10 @@ rte_mcfg_timer_unlock(void)
> >       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> >       rte_spinlock_unlock(&mcfg->tlock);
> >  }
> > +
> > +uint32_t
> > +rte_mcfg_get_single_file_segments(void)
> > +{
> > +     struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> > +     return mcfg->single_file_segments;
> > +}
> > diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
> > index 34b0e44a0..9bb4a57f8 100644
> > --- a/lib/librte_eal/common/include/rte_eal_memconfig.h
> > +++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
> > @@ -109,6 +109,16 @@ __rte_experimental
> >  void
> >  rte_mcfg_timer_unlock(void);
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Get the single_file_segments parameter value from memory configuration.

I would prefer you describe what this actually means.
We don't really care about the value itself.


> > + */
> > +__rte_experimental
> > +uint32_t

And a boolean is enough, this is a flag.
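
For illustration only (not part of the submitted patch), the bool variant could keep the v5 body in eal_common_mcfg.c and just change the return type, something like:

    #include <stdbool.h>

    bool
    rte_mcfg_get_single_file_segments(void)
    {
            struct rte_mem_config *mcfg =
                    rte_eal_get_configuration()->mem_config;

            /* set when EAL was started with --single-file-segments */
            return mcfg->single_file_segments;
    }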


> > +rte_mcfg_get_single_file_segments(void);
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> > index 7cbf82d37..c2b9d473f 100644
> > --- a/lib/librte_eal/rte_eal_version.map
> > +++ b/lib/librte_eal/rte_eal_version.map
> > @@ -418,5 +418,6 @@ EXPERIMENTAL {
> >       rte_lcore_to_cpu_id;
> >       rte_mcfg_timer_lock;
> >       rte_mcfg_timer_unlock;
> > +     rte_mcfg_get_single_file_segments;
>
> This should be moved to 19.11 block in experimental

+1


> cc'ed Dave for eal part,
> @Dave, change looks straight forward but can you please check/comment?

I don't like the name of this API, since it gives the impression it
returns "segments"..
But on the other hand, this is aligned with the mcfg field: people
touching the internals have more chances to see there is an exported
API.

Cc: Anatoly (but I think he is off for this week).

Other than that I am ok with this change.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-29 14:28     ` David Marchand
@ 2019-10-30 10:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
  2019-10-30 10:25         ` David Marchand
  0 siblings, 1 reply; 20+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco) @ 2019-10-30 10:17 UTC (permalink / raw)
  To: David Marchand, Yigit, Ferruh; +Cc: dev, Burakov, Anatoly



> > On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
> > > Zero-copy slave support for memif PMD.
> > > Slave interface exposes DPDK memory to master interface. Only single
> > > file segments are supported (EAL option --single-file-segments).
> 
> Do you really want this additional configuration in your driver or can't you
> enable/disable the functional

	Performance with multi file segments is worse than non-zero copy, so there is no reason to implement.
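
	For context (a sketch adapted from memif_tx_one_zc() in the patch, reusing the
	driver's structs, not new code): with a single file segment backing the buffer
	region, a descriptor offset is one pointer subtraction against the region base,
	whereas with multiple segments every packet would first need a memseg lookup.

	    /* region 1 is the memseg file exposed by the zero-copy slave */
	    static uint32_t
	    memif_zc_offset(struct pmd_process_private *proc_private,
	                    struct rte_mbuf *mbuf, uint16_t region)
	    {
	            return rte_pktmbuf_mtod(mbuf, uint8_t *) -
	                   (uint8_t *)proc_private->regions[region]->addr;
	    }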

> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Get the single_file_segments parameter value from memory
> configuration.
> 
> I would prefer you describe what this actually means.
> We don't really care about the value itself.

	Ack.

> 
> 
> > > + */
> > > +__rte_experimental
> > > +uint32_t
> 
> And a boolean is enough, this is a flag.

	Ack.

> 
> 
> > > +rte_mcfg_get_single_file_segments(void);
> > > +
> > >  #ifdef __cplusplus
> > >  }
> > >  #endif
> > > diff --git a/lib/librte_eal/rte_eal_version.map
> > > b/lib/librte_eal/rte_eal_version.map
> > > index 7cbf82d37..c2b9d473f 100644
> > > --- a/lib/librte_eal/rte_eal_version.map
> > > +++ b/lib/librte_eal/rte_eal_version.map
> > > @@ -418,5 +418,6 @@ EXPERIMENTAL {
> > >       rte_lcore_to_cpu_id;
> > >       rte_mcfg_timer_lock;
> > >       rte_mcfg_timer_unlock;
> > > +     rte_mcfg_get_single_file_segments;
> >
> > This should be moved to 19.11 block in experimental
> 
> +1

	Ack.

> 
> 
> > cc'ed Dave for eal part,
> > @Dave, change looks straight forward but can you please check/comment?
> 
> I don't like the name of this API, since it gives the impression it returns
> "segments"..
> But on the other hand, this is aligned with the mcfg field: people touching the
> internals have more chances to see there is an exported API.

	Do you have any suggestions? How about rte_mcfg_get_single_file_segments_parameter()?

> 
> Cc: Anatoly (but I think he is off for this week).
> 
> Other than that I am ok with this change.
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v5] net/memif: zero-copy slave
  2019-10-30 10:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
@ 2019-10-30 10:25         ` David Marchand
  0 siblings, 0 replies; 20+ messages in thread
From: David Marchand @ 2019-10-30 10:25 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
  Cc: Yigit, Ferruh, dev, Burakov, Anatoly

On Wed, Oct 30, 2019 at 11:17 AM Jakub Grajciar -X (jgrajcia -
PANTHEON TECH SRO at Cisco) <jgrajcia@cisco.com> wrote:
> > > On 8/22/2019 9:18 AM, Jakub Grajciar wrote:
> > > > Zero-copy slave support for memif PMD.
> > > > Slave interface exposes DPDK memory to master interface. Only single
> > > > file segments are supported (EAL option --single-file-segments).
> >
> > Do you really want this additional configuration in your driver or can't you
> > enable/disable the functional

Ah, I must have context-switched when writing this mail.. and forgot
to delete this part, sorry.
You can ignore.

>         Performance with multi file segments is worse than non-zero copy, so there is no reason to implement.


[snip]

> > I don't like the name of this API, since it gives the impression it returns
> > "segments"..
> > But on the other hand, this is aligned with the mcfg field: people touching the
> > internals have more chances to see there is an exported API.
>
>         Do you have any suggestions? How about rte_mcfg_get_single_file_segments_parameter()?

Nop, let's keep it as you proposed.
Thanks.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
  2019-10-04 13:23   ` Ferruh Yigit
  2019-10-25 16:44   ` Yigit, Ferruh
@ 2019-11-04 11:03   ` Jakub Grajciar
  2019-11-11 15:21     ` Ferruh Yigit
  2 siblings, 1 reply; 20+ messages in thread
From: Jakub Grajciar @ 2019-11-04 11:03 UTC (permalink / raw)
  To: dev; +Cc: Jakub Grajciar

Zero-copy slave support for memif PMD.
Slave interface exposes DPDK memory to
master interface. Only single file segments
are supported (EAL option --single-file-segments).

Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
---
 doc/guides/nics/memif.rst                     |  42 +-
 drivers/net/memif/Makefile                    |   1 +
 drivers/net/memif/memif_socket.c              |  65 +--
 drivers/net/memif/meson.build                 |   1 +
 drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
 drivers/net/memif/rte_eth_memif.h             |  11 +-
 lib/librte_eal/common/eal_common_mcfg.c       |   7 +
 .../common/include/rte_eal_memconfig.h        |  13 +
 lib/librte_eal/rte_eal_version.map            |   1 +
 9 files changed, 516 insertions(+), 74 deletions(-)

V2:
- fix coding style

V3:
- fix compilation issues

V4:
- don't move existing code
- add new EAL API rte_mcfg_get_single_file_segments,
  mem_config is now private, this api returns
  single_file_segments parameter value

V5:
- explain single file segments limitation
- add zero-copy slave example

V6:
- rte_mcfg_get_single_file_segments: move to 19.11 experimental block,
  change retval to bool, update description

diff --git a/doc/guides/nics/memif.rst b/doc/guides/nics/memif.rst
index 9a568455e..4d7f0086a 100644
--- a/doc/guides/nics/memif.rst
+++ b/doc/guides/nics/memif.rst
@@ -45,7 +45,7 @@ client.
    "socket=/tmp/memif.sock", "Socket filename", "/tmp/memif.sock", "string len 108"
    "mac=01:23:45:ab:cd:ef", "Mac address", "01:ab:23:cd:45:ef", ""
    "secret=abc123", "Secret is an optional security option, which if specified, must be matched by peer", "", "string len 24"
-   "zero-copy=yes", "Enable/disable zero-copy slave mode", "no", "yes|no"
+   "zero-copy=yes", "Enable/disable zero-copy slave mode. Only relevant to slave, requires '--single-file-segments' eal argument", "no", "yes|no"

 **Connection establishment**

@@ -171,6 +171,42 @@ Files
 - net/memif/memif.h *- descriptor and ring definitions*
 - net/memif/rte_eth_memif.c *- eth_memif_rx() eth_memif_tx()*

+Zero-copy slave
+~~~~~~~~~~~~~~~
+
+Zero-copy slave can be enabled with memif configuration option 'zero-copy=yes'. This option
+is only relevant to slave and requires eal argument '--single-file-segments'.
+This limitation is in place, because it is too expensive to identify memseg
+for each packet buffer, resulting in worse performance than with zero-copy disabled.
+With single file segments we can calculate offset from the beginning of the file
+for each packet buffer.
+
+**Shared memory format**
+
+Region 0 is created by memif driver and contains rings. Slave interface exposes DPDK memory (memseg).
+Instead of using memfd_create() to create new shared file, existing memsegs are used.
+Master interface functions the same as with zero-copy disabled.
+
+region 0:
+
++-----------------------+
+| Rings                 |
++-----------+-----------+
+| S2M rings | M2S rings |
++-----------+-----------+
+
+region n:
+
++-----------------+
+| Buffers         |
++-----------------+
+|memseg           |
++-----------------+
+
+Buffers are dequeued and enqueued as needed. Offset descriptor field is calculated at tx.
+Only single file segments mode (EAL option --single-file-segments) is supported, as calculating
+offset from multiple segments is too expensive.
+
 Example: testpmd
 ----------------------------
 In this example we run two instances of testpmd application and transmit packets over memif.
@@ -183,6 +219,10 @@ Now create ``slave`` interface (master must be already running so the slave will

     #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif -- -i

+You can also enable ``zero-copy`` on ``slave`` interface::
+
+    #./build/app/testpmd -l 2-3 --proc-type=primary --file-prefix=pmd2 --vdev=net_memif,zero-copy=yes --single-file-segments -- -i
+
 Start forwarding packets::

     Slave:
diff --git a/drivers/net/memif/Makefile b/drivers/net/memif/Makefile
index 3d92b08f2..b50fc1cdb 100644
--- a/drivers/net/memif/Makefile
+++ b/drivers/net/memif/Makefile
@@ -20,6 +20,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool
 LDLIBS += -lrte_ethdev -lrte_kvargs -lrte_net
 LDLIBS += -lrte_hash
diff --git a/drivers/net/memif/memif_socket.c b/drivers/net/memif/memif_socket.c
index 4efa68e1a..ad5e30b96 100644
--- a/drivers/net/memif/memif_socket.c
+++ b/drivers/net/memif/memif_socket.c
@@ -175,8 +175,7 @@ memif_msg_receive_hello(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_name, (char *)h->name, sizeof(pmd->remote_name));

-	MIF_LOG(DEBUG, "%s: Connecting to %s.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_name);
+	MIF_LOG(DEBUG, "Connecting to %s.", pmd->remote_name);

 	return 0;
 }
@@ -338,8 +337,7 @@ memif_msg_receive_connect(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -357,8 +355,7 @@ memif_msg_receive_connected(struct rte_eth_dev *dev, memif_msg_t *msg)

 	strlcpy(pmd->remote_if_name, (char *)c->if_name,
 		sizeof(pmd->remote_if_name));
-	MIF_LOG(INFO, "%s: Remote interface %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_if_name);
+	MIF_LOG(INFO, "Remote interface %s connected.", pmd->remote_if_name);

 	return 0;
 }
@@ -369,14 +366,13 @@ memif_msg_receive_disconnect(struct rte_eth_dev *dev, memif_msg_t *msg)
 	struct pmd_internals *pmd = dev->data->dev_private;
 	memif_msg_disconnect_t *d = &msg->disconnect;

-	memset(pmd->remote_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->remote_disc_string, 0, sizeof(pmd->remote_disc_string));
 	strlcpy(pmd->remote_disc_string, (char *)d->string,
 		sizeof(pmd->remote_disc_string));

-	MIF_LOG(INFO, "%s: Disconnect received: %s",
-		rte_vdev_device_name(pmd->vdev), pmd->remote_disc_string);
+	MIF_LOG(INFO, "Disconnect received: %s", pmd->remote_disc_string);

-	memset(pmd->local_disc_string, 0, ETH_MEMIF_DISC_STRING_SIZE);
+	memset(pmd->local_disc_string, 0, 96);
 	memif_disconnect(dev);
 	return 0;
 }
@@ -472,7 +468,6 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connect_t *c;

 	if (e == NULL)
@@ -480,7 +475,7 @@ memif_msg_enq_connect(struct rte_eth_dev *dev)

 	c = &e->msg.connect;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECT;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -490,7 +485,6 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct memif_msg_queue_elt *e = memif_msg_enq(pmd->cc);
-	const char *name = rte_vdev_device_name(pmd->vdev);
 	memif_msg_connected_t *c;

 	if (e == NULL)
@@ -498,7 +492,7 @@ memif_msg_enq_connected(struct rte_eth_dev *dev)

 	c = &e->msg.connected;
 	e->msg.type = MEMIF_MSG_TYPE_CONNECTED;
-	strlcpy((char *)c->if_name, name, sizeof(c->if_name));
+	strlcpy((char *)c->if_name, dev->data->name, sizeof(c->if_name));

 	return 0;
 }
@@ -524,7 +518,6 @@ void
 memif_disconnect(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
-	struct pmd_process_private *proc_private = dev->process_private;
 	struct memif_msg_queue_elt *elt, *next;
 	struct memif_queue *mq;
 	struct rte_intr_handle *ih;
@@ -614,7 +607,7 @@ memif_disconnect(struct rte_eth_dev *dev)
 		}
 	}

-	memif_free_regions(proc_private);
+	memif_free_regions(dev);

 	/* reset connection configuration */
 	memset(&pmd->run, 0, sizeof(pmd->run));
@@ -661,7 +654,7 @@ memif_msg_receive(struct memif_control_channel *cc)
 			if (cmsg->cmsg_type == SCM_CREDENTIALS)
 				cr = (struct ucred *)CMSG_DATA(cmsg);
 			else if (cmsg->cmsg_type == SCM_RIGHTS)
-				memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
+				rte_memcpy(&afd, CMSG_DATA(cmsg), sizeof(int));
 		}
 		cmsg = CMSG_NXTHDR(&mh, cmsg);
 	}
@@ -860,8 +853,7 @@ memif_listener_handler(void *arg)
 }

 static struct memif_socket *
-memif_socket_create(struct pmd_internals *pmd,
-		    const char *key, uint8_t listener)
+memif_socket_create(char *key, uint8_t listener)
 {
 	struct memif_socket *sock;
 	struct sockaddr_un un;
@@ -900,17 +892,15 @@ memif_socket_create(struct pmd_internals *pmd,
 		if (ret < 0)
 			goto error;

-		MIF_LOG(DEBUG, "%s: Memif listener socket %s created.",
-			rte_vdev_device_name(pmd->vdev), sock->filename);
+		MIF_LOG(DEBUG, "Memif listener socket %s created.", sock->filename);

 		sock->intr_handle.fd = sockfd;
 		sock->intr_handle.type = RTE_INTR_HANDLE_EXT;
 		ret = rte_intr_callback_register(&sock->intr_handle,
 						 memif_listener_handler, sock);
 		if (ret < 0) {
-			MIF_LOG(ERR, "%s: Failed to register interrupt "
-				"callback for listener socket",
-				rte_vdev_device_name(pmd->vdev));
+			MIF_LOG(ERR, "Failed to register interrupt "
+				"callback for listener socket");
 			return NULL;
 		}
 	}
@@ -918,9 +908,7 @@ memif_socket_create(struct pmd_internals *pmd,
 	return sock;

  error:
-	MIF_LOG(ERR, "%s: Failed to setup socket %s: %s",
-		rte_vdev_device_name(pmd->vdev) ?
-		rte_vdev_device_name(pmd->vdev) : "NULL", key, strerror(errno));
+	MIF_LOG(ERR, "Failed to setup socket %s: %s", key, strerror(errno));
 	if (sock != NULL)
 		rte_free(sock);
 	if (sockfd >= 0)
@@ -965,9 +953,8 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)
 	strlcpy(key, socket_filename, MEMIF_SOCKET_UN_SIZE);
 	ret = rte_hash_lookup_data(hash, key, (void **)&socket);
 	if (ret < 0) {
-		socket = memif_socket_create(pmd, key,
-					     (pmd->role ==
-					      MEMIF_ROLE_SLAVE) ? 0 : 1);
+		socket = memif_socket_create(key,
+					     (pmd->role == MEMIF_ROLE_SLAVE) ? 0 : 1);
 		if (socket == NULL)
 			return -1;
 		ret = rte_hash_add_key_data(hash, key, socket);
@@ -998,8 +985,7 @@ memif_socket_init(struct rte_eth_dev *dev, const char *socket_filename)

 	elt = rte_malloc("pmd-queue", sizeof(struct memif_socket_dev_list_elt), 0);
 	if (elt == NULL) {
-		MIF_LOG(ERR, "%s: Failed to add device to socket device list.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to add device to socket device list.");
 		return -1;
 	}
 	elt->dev = dev;
@@ -1077,8 +1063,7 @@ memif_connect_slave(struct rte_eth_dev *dev)

 	sockfd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
 	if (sockfd < 0) {
-		MIF_LOG(ERR, "%s: Failed to open socket.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to open socket.");
 		return -1;
 	}

@@ -1089,19 +1074,16 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = connect(sockfd, (struct sockaddr *)&sun,
 		      sizeof(struct sockaddr_un));
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to connect socket: %s.",
-			rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+		MIF_LOG(ERR, "Failed to connect socket: %s.", pmd->socket_filename);
 		goto error;
 	}

-	MIF_LOG(DEBUG, "%s: Memif socket: %s connected.",
-		rte_vdev_device_name(pmd->vdev), pmd->socket_filename);
+	MIF_LOG(DEBUG, "Memif socket: %s connected.", pmd->socket_filename);

 	pmd->cc = rte_zmalloc("memif-cc",
 			      sizeof(struct memif_control_channel), 0);
 	if (pmd->cc == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate control channel.",
-			rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to allocate control channel.");
 		goto error;
 	}

@@ -1114,8 +1096,7 @@ memif_connect_slave(struct rte_eth_dev *dev)
 	ret = rte_intr_callback_register(&pmd->cc->intr_handle,
 					 memif_intr_handler, pmd->cc);
 	if (ret < 0) {
-		MIF_LOG(ERR, "%s: Failed to register interrupt callback "
-			"for control fd", rte_vdev_device_name(pmd->vdev));
+		MIF_LOG(ERR, "Failed to register interrupt callback for control fd");
 		goto error;
 	}

diff --git a/drivers/net/memif/meson.build b/drivers/net/memif/meson.build
index a44d82535..4c1c647f6 100644
--- a/drivers/net/memif/meson.build
+++ b/drivers/net/memif/meson.build
@@ -15,5 +15,6 @@ allow_experimental_apis = true
 # - rte_mp_action_register
 # - rte_mp_reply
 # - rte_mp_request_sync
+# - rte_mcfg_get_single_file_segments

 deps += ['hash']
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 35934c834..8dd1d0d63 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -23,6 +23,10 @@
 #include <rte_kvargs.h>
 #include <rte_bus_vdev.h>
 #include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_eal_memconfig.h>

 #include "rte_eth_memif.h"
 #include "memif_socket.h"
@@ -50,6 +54,10 @@ static const char * const valid_arguments[] = {

 #define MEMIF_MP_SEND_REGION		"memif_mp_send_region"

+
+static int memif_region_init_zc(const struct rte_memseg_list *msl,
+				const struct rte_memseg *ms, void *arg);
+
 const char *
 memif_version(void)
 {
@@ -116,10 +124,14 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 	struct mp_region_msg *reply_param;
 	struct memif_region *r;
 	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
+	/* in case of zero-copy slave, only request region 0 */
+	uint16_t max_region_num = (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) ?
+				   1 : ETH_MEMIF_MAX_REGION_NUM;

 	MIF_LOG(DEBUG, "Requesting memory regions");

-	for (i = 0; i < ETH_MEMIF_MAX_REGION_NUM; i++) {
+	for (i = 0; i < max_region_num; i++) {
 		/* Prepare the message */
 		memset(&msg, 0, sizeof(msg));
 		strlcpy(msg.name, MEMIF_MP_SEND_REGION, sizeof(msg.name));
@@ -161,6 +173,12 @@ memif_mp_request_regions(struct rte_eth_dev *dev)
 		free(reply);
 	}

+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)proc_private);
+		if (ret < 0)
+			return ret;
+	}
+
 	return memif_connect(dev);
 }

@@ -220,6 +238,23 @@ memif_get_buffer(struct pmd_process_private *proc_private, memif_desc_t *d)
 	return ((uint8_t *)proc_private->regions[d->region]->addr + d->offset);
 }

+/* Free mbufs received by master */
+static void
+memif_free_stored_mbufs(struct pmd_process_private *proc_private, struct memif_queue *mq)
+{
+	uint16_t mask = (1 << mq->log2_ring_size) - 1;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+
+	/* FIXME: improve performance */
+	while (mq->last_tail != ring->tail) {
+		RTE_MBUF_PREFETCH_TO_FREE(mq->buffers[(mq->last_tail + 1) & mask]);
+		/* Decrement refcnt and free mbuf. (current segment) */
+		rte_mbuf_refcnt_update(mq->buffers[mq->last_tail & mask], -1);
+		rte_pktmbuf_free_seg(mq->buffers[mq->last_tail & mask]);
+		mq->last_tail++;
+	}
+}
+
 static int
 memif_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *cur_tail,
 		    struct rte_mbuf *tail)
@@ -334,8 +369,8 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;

 			memcpy(rte_pktmbuf_mtod_offset(mbuf, void *, dst_off),
-			       (uint8_t *)memif_get_buffer(proc_private, d0) +
-			       src_off, cp_len);
+			       (uint8_t *)memif_get_buffer(proc_private, d0) + src_off,
+			       cp_len);

 			src_off += cp_len;
 			dst_off += cp_len;
@@ -378,6 +413,120 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_rx_pkts;
 }

+static uint16_t
+eth_memif_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t cur_slot, last_slot, n_slots, ring_size, mask, s0, head;
+	uint16_t n_rx_pkts = 0;
+	memif_desc_t *d0;
+	struct rte_mbuf *mbuf, *mbuf_tail;
+	struct rte_mbuf *mbuf_head = NULL;
+	int ret;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	/* consume interrupt */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t b;
+		ssize_t size __rte_unused;
+		size = read(mq->intr_handle.fd, &b, sizeof(b));
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	cur_slot = mq->last_tail;
+	last_slot = ring->tail;
+	if (cur_slot == last_slot)
+		goto refill;
+	n_slots = last_slot - cur_slot;
+
+	while (n_slots && n_rx_pkts < nb_pkts) {
+		s0 = cur_slot & mask;
+
+		d0 = &ring->desc[s0];
+		mbuf_head = mq->buffers[s0];
+		mbuf = mbuf_head;
+
+next_slot:
+		/* prefetch next descriptor */
+		if (n_rx_pkts + 1 < nb_pkts)
+			rte_prefetch0(&ring->desc[(cur_slot + 1) & mask]);
+
+		mbuf->port = mq->in_port;
+		rte_pktmbuf_data_len(mbuf) = d0->length;
+		rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+
+		mq->n_bytes += rte_pktmbuf_data_len(mbuf);
+
+		cur_slot++;
+		n_slots--;
+		if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
+			mbuf_tail = mbuf;
+			mbuf = mq->buffers[s0];
+			ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+			if (unlikely(ret < 0)) {
+				MIF_LOG(ERR, "number-of-segments-overflow");
+				goto refill;
+			}
+			goto next_slot;
+		}
+
+		*bufs++ = mbuf_head;
+		n_rx_pkts++;
+	}
+
+	mq->last_tail = cur_slot;
+
+/* Supply master with new buffers */
+refill:
+	head = ring->head;
+	n_slots = ring_size - head + mq->last_tail;
+
+	if (n_slots < 32)
+		goto no_free_mbufs;
+
+	ret = rte_pktmbuf_alloc_bulk(mq->mempool, &mq->buffers[head & mask], n_slots);
+	if (unlikely(ret < 0))
+		goto no_free_mbufs;
+
+	while (n_slots--) {
+		s0 = head++ & mask;
+		if (n_slots > 0)
+			rte_prefetch0(mq->buffers[head & mask]);
+		d0 = &ring->desc[s0];
+		/* store buffer header */
+		mbuf = mq->buffers[s0];
+		/* populate descriptor */
+		d0->length = rte_pktmbuf_data_room_size(mq->mempool) -
+				RTE_PKTMBUF_HEADROOM;
+		d0->region = 1;
+		d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+			(uint8_t *)proc_private->regions[d0->region]->addr;
+	}
+no_free_mbufs:
+	rte_mb();
+	ring->head = head;
+
+	mq->n_pkts += n_rx_pkts;
+
+	return n_rx_pkts;
+}
+
 static uint16_t
 eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -498,19 +647,173 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }

+
+static int
+memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
+		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,
+		uint16_t slot, uint16_t n_free)
+{
+	memif_desc_t *d0;
+	int used_slots = 1;
+
+next_in_chain:
+	/* store pointer to mbuf to free it later */
+	mq->buffers[slot & mask] = mbuf;
+	/* Increment refcnt to make sure the buffer is not freed before master
+	 * receives it. (current segment)
+	 */
+	rte_mbuf_refcnt_update(mbuf, 1);
+	/* populate descriptor */
+	d0 = &ring->desc[slot & mask];
+	d0->length = rte_pktmbuf_data_len(mbuf);
+	/* FIXME: get region index */
+	d0->region = 1;
+	d0->offset = rte_pktmbuf_mtod(mbuf, uint8_t *) -
+		(uint8_t *)proc_private->regions[d0->region]->addr;
+	d0->flags = 0;
+
+	/* check if buffer is chained */
+	if (rte_pktmbuf_is_contiguous(mbuf) == 0) {
+		if (n_free < 2)
+			return 0;
+		/* mark buffer as chained */
+		d0->flags |= MEMIF_DESC_FLAG_NEXT;
+		/* advance mbuf */
+		mbuf = mbuf->next;
+		/* update counters */
+		used_slots++;
+		slot++;
+		n_free--;
+		goto next_in_chain;
+	}
+	return used_slots;
+}
+
+static uint16_t
+eth_memif_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct memif_queue *mq = queue;
+	struct pmd_internals *pmd = rte_eth_devices[mq->in_port].data->dev_private;
+	struct pmd_process_private *proc_private =
+		rte_eth_devices[mq->in_port].process_private;
+	memif_ring_t *ring = memif_get_ring_from_queue(proc_private, mq);
+	uint16_t slot, n_free, ring_size, mask, n_tx_pkts = 0;
+	memif_ring_type_t type = mq->type;
+	struct rte_eth_link link;
+
+	if (unlikely((pmd->flags & ETH_MEMIF_FLAG_CONNECTED) == 0))
+		return 0;
+	if (unlikely(ring == NULL)) {
+		/* Secondary process will attempt to request regions. */
+		rte_eth_link_get(mq->in_port, &link);
+		return 0;
+	}
+
+	ring_size = 1 << mq->log2_ring_size;
+	mask = ring_size - 1;
+
+	/* free mbufs received by master */
+	memif_free_stored_mbufs(proc_private, mq);
+
+	/* ring type always MEMIF_RING_S2M */
+	slot = ring->head;
+	n_free = ring_size - ring->head + mq->last_tail;
+
+	int used_slots;
+
+	while (n_free && (n_tx_pkts < nb_pkts)) {
+		while ((n_free > 4) && ((nb_pkts - n_tx_pkts) > 4)) {
+			if ((nb_pkts - n_tx_pkts) > 8) {
+				rte_prefetch0(*bufs + 4);
+				rte_prefetch0(*bufs + 5);
+				rte_prefetch0(*bufs + 6);
+				rte_prefetch0(*bufs + 7);
+			}
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+
+			used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+				mask, slot, n_free);
+			if (unlikely(used_slots < 1))
+				goto no_free_slots;
+			n_tx_pkts++;
+			slot += used_slots;
+			n_free -= used_slots;
+		}
+		used_slots = memif_tx_one_zc(proc_private, mq, ring, *bufs++,
+			mask, slot, n_free);
+		if (unlikely(used_slots < 1))
+			goto no_free_slots;
+		n_tx_pkts++;
+		slot += used_slots;
+		n_free -= used_slots;
+	}
+
+no_free_slots:
+	rte_mb();
+	/* update ring pointers */
+	if (type == MEMIF_RING_S2M)
+		ring->head = slot;
+	else
+		ring->tail = slot;
+
+	/* Send interrupt, if enabled. */
+	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
+		uint64_t a = 1;
+		ssize_t size = write(mq->intr_handle.fd, &a, sizeof(a));
+		if (unlikely(size < 0)) {
+			MIF_LOG(WARNING,
+				"Failed to send interrupt. %s", strerror(errno));
+		}
+	}
+
+	/* increment queue counters */
+	mq->n_pkts += n_tx_pkts;
+
+	return n_tx_pkts;
+}
+
 void
-memif_free_regions(struct pmd_process_private *proc_private)
+memif_free_regions(struct rte_eth_dev *dev)
 {
+	struct pmd_process_private *proc_private = dev->process_private;
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int i;
 	struct memif_region *r;

-	MIF_LOG(DEBUG, "Free memory regions");
 	/* regions are allocated contiguously, so it's
 	 * enough to loop until 'proc_private->regions_num'
 	 */
 	for (i = 0; i < proc_private->regions_num; i++) {
 		r = proc_private->regions[i];
 		if (r != NULL) {
+			/* Region is DPDK memory (memseg), do not munmap it */
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				r->addr = NULL;
+				if (r->fd > 0)
+					close(r->fd);
+			}
 			if (r->addr != NULL) {
 				munmap(r->addr, r->region_size);
 				if (r->fd > 0) {
@@ -525,6 +828,45 @@ memif_free_regions(struct pmd_process_private *proc_private)
 	proc_private->regions_num = 0;
 }

+static int
+memif_region_init_zc(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		     void *arg)
+{
+	struct pmd_process_private *proc_private = (struct pmd_process_private *)arg;
+	struct memif_region *r;
+
+	if (proc_private->regions_num < 1) {
+		MIF_LOG(ERR, "Missing descriptor region");
+		return -1;
+	}
+
+	r = proc_private->regions[proc_private->regions_num - 1];
+
+	if (r->addr != msl->base_va)
+		r = proc_private->regions[++proc_private->regions_num - 1];
+
+	if (r == NULL) {
+		r = rte_zmalloc("region", sizeof(struct memif_region), 0);
+		if (r == NULL) {
+			MIF_LOG(ERR, "Failed to alloc memif region.");
+			return -ENOMEM;
+		}
+
+		r->addr = msl->base_va;
+		r->region_size = ms->len;
+		r->fd = rte_memseg_get_fd(ms);
+		if (r->fd < 0)
+			return -1;
+		r->pkt_buffer_offset = 0;
+
+		proc_private->regions[proc_private->regions_num - 1] = r;
+	} else {
+		r->region_size += ms->len;
+	}
+
+	return 0;
+}
+
 static int
 memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 {
@@ -605,12 +947,29 @@ memif_region_init_shm(struct rte_eth_dev *dev, uint8_t has_buffers)
 static int
 memif_regions_init(struct rte_eth_dev *dev)
 {
+	struct pmd_internals *pmd = dev->data->dev_private;
 	int ret;

-	/* create one buffer region */
-	ret = memif_region_init_shm(dev, /* has buffer */ 1);
-	if (ret < 0)
-		return ret;
+	/*
+	 * Zero-copy exposes DPDK memory.
+	 * Each memseg list will be represented by a memif region.
+	 * Zero-copy region indexing: memseg list idx + 1,
+	 * as region 0 is already reserved for descriptors.
+	 */
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		/* create region idx 0 containing descriptors */
+		ret = memif_region_init_shm(dev, 0);
+		if (ret < 0)
+			return ret;
+		ret = rte_memseg_walk(memif_region_init_zc, (void *)dev->process_private);
+		if (ret < 0)
+			return ret;
+	} else {
+		/* create one memory region containing rings and buffers */
+		ret = memif_region_init_shm(dev, /* has buffers */ 1);
+		if (ret < 0)
+			return ret;
+	}

 	return 0;
 }
@@ -630,6 +989,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = i * (1 << pmd->run.log2_ring_size) + j;
 			ring->desc[j].region = 0;
@@ -646,6 +1009,10 @@ memif_init_rings(struct rte_eth_dev *dev)
 		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
+
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)
+			continue;
+
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
 			slot = (i + pmd->run.num_s2m_rings) *
 			    (1 << pmd->run.log2_ring_size) + j;
@@ -659,7 +1026,7 @@ memif_init_rings(struct rte_eth_dev *dev)
 }

 /* called only by slave */
-static void
+static int
 memif_init_queues(struct rte_eth_dev *dev)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
@@ -680,6 +1047,13 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for tx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}

 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
@@ -696,7 +1070,15 @@ memif_init_queues(struct rte_eth_dev *dev)
 				"Failed to create eventfd for rx queue %d: %s.", i,
 				strerror(errno));
 		}
+		mq->buffers = NULL;
+		if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+			mq->buffers = rte_zmalloc("bufs", sizeof(struct rte_mbuf *) *
+						  (1 << mq->log2_ring_size), 0);
+			if (mq->buffers == NULL)
+				return -ENOMEM;
+		}
 	}
+	return 0;
 }

 int
@@ -710,7 +1092,9 @@ memif_init_regions_and_queues(struct rte_eth_dev *dev)

 	memif_init_rings(dev);

-	memif_init_queues(dev);
+	ret = memif_init_queues(dev);
+	if (ret < 0)
+		return ret;

 	return 0;
 }
@@ -734,8 +1118,16 @@ memif_connect(struct rte_eth_dev *dev)
 				mr->addr = mmap(NULL, mr->region_size,
 						PROT_READ | PROT_WRITE,
 						MAP_SHARED, mr->fd, 0);
-				if (mr->addr == NULL)
+				if (mr->addr == MAP_FAILED) {
+					MIF_LOG(ERR, "mmap failed: %s\n",
+						strerror(errno));
 					return -1;
+				}
+			}
+			if (i > 0 && (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY)) {
+				/* close memseg file */
+				close(mr->fd);
+				mr->fd = -1;
 			}
 		}
 	}
@@ -796,8 +1188,7 @@ memif_dev_start(struct rte_eth_dev *dev)
 		ret = memif_connect_master(dev);
 		break;
 	default:
-		MIF_LOG(ERR, "%s: Unknown role: %d.",
-			rte_vdev_device_name(pmd->vdev), pmd->role);
+		MIF_LOG(ERR, "Unknown role: %d.", pmd->role);
 		ret = -1;
 		break;
 	}
@@ -862,8 +1253,7 @@ memif_tx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("tx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate tx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate tx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -892,8 +1282,7 @@ memif_rx_queue_setup(struct rte_eth_dev *dev,

 	mq = rte_zmalloc("rx-queue", sizeof(struct memif_queue), 0);
 	if (mq == NULL) {
-		MIF_LOG(ERR, "%s: Failed to allocate rx queue id: %u",
-			rte_vdev_device_name(pmd->vdev), qid);
+		MIF_LOG(ERR, "Failed to allocate rx queue id: %u", qid);
 		return -ENOMEM;
 	}

@@ -933,7 +1322,7 @@ memif_link_update(struct rte_eth_dev *dev,
 			memif_mp_request_regions(dev);
 		} else if (dev->data->dev_link.link_status == ETH_LINK_DOWN &&
 				proc_private->regions_num > 0) {
-			memif_free_regions(proc_private);
+			memif_free_regions(dev);
 		}
 	}
 	return 0;
@@ -1054,11 +1443,6 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	const unsigned int numa_node = vdev->device.numa_node;
 	const char *name = rte_vdev_device_name(vdev);

-	if (flags & ETH_MEMIF_FLAG_ZERO_COPY) {
-		MIF_LOG(ERR, "Zero-copy slave not supported.");
-		return -1;
-	}
-
 	eth_dev = rte_eth_vdev_allocate(vdev, sizeof(*pmd));
 	if (eth_dev == NULL) {
 		MIF_LOG(ERR, "%s: Unable to allocate device struct.", name);
@@ -1082,6 +1466,9 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,
 	pmd->flags = flags;
 	pmd->flags |= ETH_MEMIF_FLAG_DISABLED;
 	pmd->role = role;
+	/* Zero-copy flag is irrelevant to the master. */
+	if (pmd->role == MEMIF_ROLE_MASTER)
+		pmd->flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;

 	ret = memif_socket_init(eth_dev, socket_filename);
 	if (ret < 0)
@@ -1105,8 +1492,14 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role,

 	eth_dev->dev_ops = &ops;
 	eth_dev->device = &vdev->device;
-	eth_dev->rx_pkt_burst = eth_memif_rx;
-	eth_dev->tx_pkt_burst = eth_memif_tx;
+	if (pmd->flags & ETH_MEMIF_FLAG_ZERO_COPY) {
+		eth_dev->rx_pkt_burst = eth_memif_rx_zc;
+		eth_dev->tx_pkt_burst = eth_memif_tx_zc;
+	} else {
+		eth_dev->rx_pkt_burst = eth_memif_rx;
+		eth_dev->tx_pkt_burst = eth_memif_tx;
+	}
+

 	eth_dev->data->dev_flags &= RTE_ETH_DEV_CLOSE_REMOVE;

@@ -1138,6 +1531,10 @@ memif_set_zc(const char *key __rte_unused, const char *value, void *extra_args)
 	uint32_t *flags = (uint32_t *)extra_args;

 	if (strstr(value, "yes") != NULL) {
+		if (!rte_mcfg_get_single_file_segments()) {
+			MIF_LOG(ERR, "Zero-copy doesn't support multi-file segments.");
+			return -ENOTSUP;
+		}
 		*flags |= ETH_MEMIF_FLAG_ZERO_COPY;
 	} else if (strstr(value, "no") != NULL) {
 		*flags &= ~ETH_MEMIF_FLAG_ZERO_COPY;
diff --git a/drivers/net/memif/rte_eth_memif.h b/drivers/net/memif/rte_eth_memif.h
index 8269212cb..0d2566392 100644
--- a/drivers/net/memif/rte_eth_memif.h
+++ b/drivers/net/memif/rte_eth_memif.h
@@ -63,12 +63,15 @@ struct memif_queue {
 	uint16_t last_head;			/**< last ring head */
 	uint16_t last_tail;			/**< last ring tail */

+	struct rte_mbuf **buffers;
+	/**< Stored mbufs. Used in zero-copy tx. Slave stores transmitted
+	 * mbufs to free them once master has received them.
+	 */
+
 	/* rx/tx info */
 	uint64_t n_pkts;			/**< number of rx/tx packets */
 	uint64_t n_bytes;			/**< number of rx/tx bytes */

-	memif_ring_t *ring;			/**< pointer to ring */
-
 	struct rte_intr_handle intr_handle;	/**< interrupt handle */

 	memif_log2_ring_size_t log2_ring_size;	/**< log2 of ring size */
@@ -115,8 +118,6 @@ struct pmd_internals {
 	/**< local disconnect reason */
 	char remote_disc_string[ETH_MEMIF_DISC_STRING_SIZE];
 	/**< remote disconnect reason */
-
-	struct rte_vdev_device *vdev;		/**< vdev handle */
 };

 struct pmd_process_private {
@@ -131,7 +132,7 @@ struct pmd_process_private {
  * @param proc_private
  *   device process private data
  */
-void memif_free_regions(struct pmd_process_private *proc_private);
+void memif_free_regions(struct rte_eth_dev *dev);

 /**
  * Finalize connection establishment process. Map shared memory file
diff --git a/lib/librte_eal/common/eal_common_mcfg.c b/lib/librte_eal/common/eal_common_mcfg.c
index 0cf9a625f..36a8ca8fc 100644
--- a/lib/librte_eal/common/eal_common_mcfg.c
+++ b/lib/librte_eal/common/eal_common_mcfg.c
@@ -162,3 +162,10 @@ rte_mcfg_timer_unlock(void)
 	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	rte_spinlock_unlock(&mcfg->tlock);
 }
+
+bool
+rte_mcfg_get_single_file_segments(void)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	return (bool)mcfg->single_file_segments;
+}
diff --git a/lib/librte_eal/common/include/rte_eal_memconfig.h b/lib/librte_eal/common/include/rte_eal_memconfig.h
index 34b0e44a0..dede2ee32 100644
--- a/lib/librte_eal/common/include/rte_eal_memconfig.h
+++ b/lib/librte_eal/common/include/rte_eal_memconfig.h
@@ -5,6 +5,8 @@
 #ifndef _RTE_EAL_MEMCONFIG_H_
 #define _RTE_EAL_MEMCONFIG_H_

+#include <stdbool.h>
+
 #include <rte_compat.h>

 /**
@@ -109,6 +111,17 @@ __rte_experimental
 void
 rte_mcfg_timer_unlock(void);

+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * If true, pages are put in single files (per memseg list),
+ * as opposed to creating a file per page.
+ */
+__rte_experimental
+bool
+rte_mcfg_get_single_file_segments(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 3478d3b85..202db849f 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -429,4 +429,5 @@ EXPERIMENTAL {

 	# added in 19.11
 	rte_log_get_stream;
+	rte_mcfg_get_single_file_segments;
 };
--
2.17.1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-11-04 11:03   ` [dpdk-dev] [PATCH v6] " Jakub Grajciar
@ 2019-11-11 15:21     ` Ferruh Yigit
  2019-11-11 15:24       ` Thomas Monjalon
  2019-11-11 15:49       ` David Marchand
  0 siblings, 2 replies; 20+ messages in thread
From: Ferruh Yigit @ 2019-11-11 15:21 UTC (permalink / raw)
  To: David Marchand, Anatoly Burakov; +Cc: Jakub Grajciar, dev

On 11/4/2019 11:03 AM, Jakub Grajciar wrote:
> Zero-copy slave support for memif PMD.
> Slave interface exposes DPDK memory to
> master interface. Only single file segments
> are supported (EAL option --single-file-segments).
> 
> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
> ---
>  doc/guides/nics/memif.rst                     |  42 +-
>  drivers/net/memif/Makefile                    |   1 +
>  drivers/net/memif/memif_socket.c              |  65 +--
>  drivers/net/memif/meson.build                 |   1 +
>  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
>  drivers/net/memif/rte_eth_memif.h             |  11 +-
>  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
>  .../common/include/rte_eal_memconfig.h        |  13 +
>  lib/librte_eal/rte_eal_version.map            |   1 +
>  9 files changed, 516 insertions(+), 74 deletions(-)

net/memif part looks good to me,

@David, @Anatoly, any concern on new eal API, is it good to go?

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-11-11 15:21     ` Ferruh Yigit
@ 2019-11-11 15:24       ` Thomas Monjalon
  2019-11-12 12:55         ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
  2019-11-11 15:49       ` David Marchand
  1 sibling, 1 reply; 20+ messages in thread
From: Thomas Monjalon @ 2019-11-11 15:24 UTC (permalink / raw)
  To: Jakub Grajciar; +Cc: dev, Ferruh Yigit, David Marchand, Anatoly Burakov

11/11/2019 16:21, Ferruh Yigit:
> On 11/4/2019 11:03 AM, Jakub Grajciar wrote:
> > Zero-copy slave support for memif PMD.
> > Slave interface exposes DPDK memory to
> > master interface. Only single file segments
> > are supported (EAL option --single-file-segments).
> > 
> > Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
> > ---
> >  doc/guides/nics/memif.rst                     |  42 +-
> >  drivers/net/memif/Makefile                    |   1 +
> >  drivers/net/memif/memif_socket.c              |  65 +--
> >  drivers/net/memif/meson.build                 |   1 +
> >  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
> >  drivers/net/memif/rte_eth_memif.h             |  11 +-
> >  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
> >  .../common/include/rte_eal_memconfig.h        |  13 +
> >  lib/librte_eal/rte_eal_version.map            |   1 +
> >  9 files changed, 516 insertions(+), 74 deletions(-)
> 
> net/memif part looks good to me,
> 
> @David, @Anatoly, any concern on new eal API, is it good to go?

I didn't get into details of this PMD,
but requesting an EAL property looks strange to me.
Is memif using directly some EAL memory?
Or is it using memory from mempool?
If it is from mempool, then the property should be requested to mempool.
Reminder: mempool memory is not always from EAL.




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-11-11 15:21     ` Ferruh Yigit
  2019-11-11 15:24       ` Thomas Monjalon
@ 2019-11-11 15:49       ` David Marchand
  2019-11-15 16:55         ` Ferruh Yigit
  1 sibling, 1 reply; 20+ messages in thread
From: David Marchand @ 2019-11-11 15:49 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: Anatoly Burakov, Jakub Grajciar, dev

On Mon, Nov 11, 2019 at 4:21 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>
> On 11/4/2019 11:03 AM, Jakub Grajciar wrote:
> > Zero-copy slave support for memif PMD.
> > Slave interface exposes DPDK memory to
> > master interface. Only single file segments
> > are supported (EAL option --single-file-segments).
> >
> > Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
> > ---
> >  doc/guides/nics/memif.rst                     |  42 +-
> >  drivers/net/memif/Makefile                    |   1 +
> >  drivers/net/memif/memif_socket.c              |  65 +--
> >  drivers/net/memif/meson.build                 |   1 +
> >  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
> >  drivers/net/memif/rte_eth_memif.h             |  11 +-
> >  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
> >  .../common/include/rte_eal_memconfig.h        |  13 +
> >  lib/librte_eal/rte_eal_version.map            |   1 +
> >  9 files changed, 516 insertions(+), 74 deletions(-)
>
> net/memif part looks good to me,
>
> @David, @Anatoly, any concern on new eal API, is it good to go?

I am okay with exposing such info from the mem config since it was
exposed before.
It will be experimental anyway.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-11-11 15:24       ` Thomas Monjalon
@ 2019-11-12 12:55         ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
  0 siblings, 0 replies; 20+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco) @ 2019-11-12 12:55 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, Ferruh Yigit, David Marchand, Anatoly Burakov



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Monday, November 11, 2019 4:25 PM
> To: Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
> <jgrajcia@cisco.com>
> Cc: dev@dpdk.org; Ferruh Yigit <ferruh.yigit@intel.com>; David Marchand
> <david.marchand@redhat.com>; Anatoly Burakov
> <anatoly.burakov@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
>
> I didn't get into details of this PMD,
> but requesting an EAL property looks strange to me.
> Is memif using directly some EAL memory?
> Or is it using memory from mempool?
> If it is from mempool, then the property should be requested to mempool.
> Reminder: mempool memory is not always from EAL.

The memif driver exposes DPDK memsegs to provide zero-copy. Each memseg list is considered one memory region by the PMD.
See net/memif/rte_eth_memif.c:memif_region_init_zc()
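
For anyone skimming the thread, a minimal illustrative sketch (not part of
the patch) of that mapping: every memseg list becomes one candidate memif
region, numbered idx + 1 because region 0 holds the descriptor rings. It
only assumes the public EAL memory API (rte_memseg_list_walk() and
rte_eal_init()); the helper name dump_memif_region() is made up for the
example.

#include <stdio.h>
#include <rte_eal.h>
#include <rte_memory.h>
#include <rte_eal_memconfig.h>

/* Print the memseg lists a zero-copy slave would expose, one per region.
 * Region 0 is reserved for the rings, so buffer regions start at index 1.
 */
static int
dump_memif_region(const struct rte_memseg_list *msl, void *arg)
{
	unsigned int *region_idx = arg;

	printf("region %u <- memseg list base_va=%p\n",
	       (*region_idx)++, msl->base_va);
	return 0;
}

int
main(int argc, char **argv)
{
	unsigned int idx = 1;

	/* run with --single-file-segments, as required for zero-copy */
	if (rte_eal_init(argc, argv) < 0)
		return -1;

	rte_memseg_list_walk(dump_memif_region, &idx);
	return 0;
}

My guess at why only --single-file-segments is workable here: each memseg
list is then backed by a single file, so a whole region can be described to
the master by one fd and descriptor offsets can be computed directly against
the list's base_va.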


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [dpdk-dev] [PATCH v6] net/memif: zero-copy slave
  2019-11-11 15:49       ` David Marchand
@ 2019-11-15 16:55         ` Ferruh Yigit
  0 siblings, 0 replies; 20+ messages in thread
From: Ferruh Yigit @ 2019-11-15 16:55 UTC (permalink / raw)
  To: David Marchand; +Cc: Anatoly Burakov, Jakub Grajciar, dev

On 11/11/2019 3:49 PM, David Marchand wrote:
> On Mon, Nov 11, 2019 at 4:21 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>>
>> On 11/4/2019 11:03 AM, Jakub Grajciar wrote:
>>> Zero-copy slave support for memif PMD.
>>> Slave interface exposes DPDK memory to
>>> master interface. Only single file segments
>>> are supported (EAL option --single-file-segments).
>>>
>>> Signed-off-by: Jakub Grajciar <jgrajcia@cisco.com>
>>> ---
>>>  doc/guides/nics/memif.rst                     |  42 +-
>>>  drivers/net/memif/Makefile                    |   1 +
>>>  drivers/net/memif/memif_socket.c              |  65 +--
>>>  drivers/net/memif/meson.build                 |   1 +
>>>  drivers/net/memif/rte_eth_memif.c             | 449 +++++++++++++++++-
>>>  drivers/net/memif/rte_eth_memif.h             |  11 +-
>>>  lib/librte_eal/common/eal_common_mcfg.c       |   7 +
>>>  .../common/include/rte_eal_memconfig.h        |  13 +
>>>  lib/librte_eal/rte_eal_version.map            |   1 +
>>>  9 files changed, 516 insertions(+), 74 deletions(-)
>>
>> net/memif part looks good to me,
>>
>> @David, @Anatoly, any concern on new eal API, is it good to go?
> 
> I am okay with exposing such info from the mem config since it was
> exposed before.
> It will be experimental anyway.
> 

Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

Applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2019-11-15 16:55 UTC | newest]

Thread overview: 20+ messages
2019-07-09  8:22 [dpdk-dev] [PATCH v4] net/memif: zero-copy slave Jakub Grajciar
2019-07-10 15:06 ` Ferruh Yigit
2019-07-23 12:35   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
2019-08-22  8:18 ` [dpdk-dev] [PATCH v5] " Jakub Grajciar
2019-10-04 13:23   ` Ferruh Yigit
2019-10-15 16:59     ` Ferruh Yigit
2019-10-17 11:52       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
2019-10-17 16:04         ` Ferruh Yigit
2019-10-17 16:45           ` Ferruh Yigit
2019-10-21 16:07             ` Ferruh Yigit
2019-10-25 16:44   ` Yigit, Ferruh
2019-10-29 14:28     ` David Marchand
2019-10-30 10:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
2019-10-30 10:25         ` David Marchand
2019-11-04 11:03   ` [dpdk-dev] [PATCH v6] " Jakub Grajciar
2019-11-11 15:21     ` Ferruh Yigit
2019-11-11 15:24       ` Thomas Monjalon
2019-11-12 12:55         ` Jakub Grajciar -X (jgrajcia - PANTHEON TECH SRO at Cisco)
2019-11-11 15:49       ` David Marchand
2019-11-15 16:55         ` Ferruh Yigit
