DPDK patches and discussions
 help / color / Atom feed
From: Matan Azrad <matan@mellanox.com>
To: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: dev@dpdk.org, Viacheslav Ovsiienko <viacheslavo@mellanox.com>
Subject: [dpdk-dev] [PATCH v2 1/3] vdpa/mlx5: optimize notification events
Date: Thu, 25 Jun 2020 13:30:36 +0000
Message-ID: <1593091838-51869-2-git-send-email-matan@mellanox.com> (raw)
In-Reply-To: <1593091838-51869-1-git-send-email-matan@mellanox.com>

When the virtio guest driver doesn't work in poll mode, the driver
creates an event mechanism in order to schedule completion notifications
for each virtq traffic burst.

When traffic comes to a virtq, a CQE will be added to the virtq CQ by
the FW.
The driver requests an interrupt for the next CQE index, and when the
interrupt is triggered, the driver polls the CQ and notifies the guest by
writing to the virtq callfd.

According to the described method, the interrupts will be triggered for
each burst of traffic. The burst size depends on interrupt latency.

Interrupt management takes a lot of CPU cycles, and using it for each
traffic burst takes a big portion of CPU capacity.

When traffic is on, using timer for CQ poll scheduling instead of
interrupts saves a lot of CPU cycles.

Move CQ poll scheduling to be done by timer in case of running traffic.
Request interrupts only when traffic is off.

The timer scheduling management is done by a new dedicated thread that
uses the usleep() call.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/vdpa/mlx5/Makefile          |   1 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |   7 ++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 175 ++++++++++++++++++++++++++++++------
 3 files changed, 157 insertions(+), 26 deletions(-)

diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
index 91c89d6..8a1c2ea 100644
--- a/drivers/vdpa/mlx5/Makefile
+++ b/drivers/vdpa/mlx5/Makefile
@@ -31,6 +31,7 @@ CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
 LDLIBS += -lrte_common_mlx5
 LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched
+LDLIBS += -pthread
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index 80b4c4b..ae1dcd8 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -39,6 +39,7 @@ struct mlx5_vdpa_cq {
 	uint16_t log_desc_n;
 	uint32_t cq_ci:24;
 	uint32_t arm_sn:2;
+	uint32_t armed:1;
 	int callfd;
 	rte_spinlock_t sl;
 	struct mlx5_devx_obj *cq;
@@ -103,6 +104,12 @@ struct mlx5_vdpa_priv {
 	TAILQ_ENTRY(mlx5_vdpa_priv) next;
 	uint8_t configured;
 	uint8_t direct_notifier; /* Whether direct notifier is on or off. */
+	uint64_t last_traffic_tic;
+	pthread_t timer_tid;
+	pthread_mutex_t timer_lock;
+	pthread_cond_t timer_cond;
+	volatile uint8_t timer_on;
+	uint32_t timer_delay_us;
 	int id; /* vDPA device id. */
 	int vid; /* vhost device id. */
 	struct ibv_context *ctx; /* Device context. */
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index dd60150..69c8bf6 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -12,6 +12,7 @@
 #include <rte_atomic.h>
 #include <rte_common.h>
 #include <rte_io.h>
+#include <rte_alarm.h>
 
 #include <mlx5_common.h>
 
@@ -19,6 +20,9 @@
 #include "mlx5_vdpa.h"
 
 
+#define MLX5_VDPA_DEFAULT_TIMER_DELAY_US 500u
+#define MLX5_VDPA_NO_TRAFFIC_TIME_S 2LLU
+
 void
 mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
 {
@@ -26,10 +30,23 @@
 		mlx5_glue->devx_free_uar(priv->uar);
 		priv->uar = NULL;
 	}
+#ifdef HAVE_IBV_DEVX_EVENT
 	if (priv->eventc) {
+		union {
+			struct mlx5dv_devx_async_event_hdr event_resp;
+			uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr)
+									 + 128];
+		} out;
+
+		/* Clean all pending events. */
+		while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp,
+		       sizeof(out.buf)) >=
+		       (ssize_t)sizeof(out.event_resp.cookie))
+			;
 		mlx5_glue->devx_destroy_event_channel(priv->eventc);
 		priv->eventc = NULL;
 	}
+#endif
 	priv->eqn = 0;
 }
 
@@ -79,7 +96,7 @@
 	memset(cq, 0, sizeof(*cq));
 }
 
-static inline void
+static inline void __rte_unused
 mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq)
 {
 	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
@@ -100,6 +117,7 @@
 	*((uint32_t *)addr + 1) = db_be >> 32;
 #endif
 	cq->arm_sn++;
+	cq->armed = 1;
 }
 
 static int
@@ -157,6 +175,16 @@
 		rte_errno = errno;
 		goto error;
 	}
+	if (callfd != -1) {
+		ret = mlx5_glue->devx_subscribe_devx_event_fd(priv->eventc,
+							      callfd,
+							      cq->cq->obj, 0);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to subscribe CQE event fd.");
+			rte_errno = errno;
+			goto error;
+		}
+	}
 	cq->callfd = callfd;
 	/* Init CQ to ones to be in HW owner in the start. */
 	memset((void *)(uintptr_t)cq->umem_buf, 0xFF, attr.db_umem_offset);
@@ -168,27 +196,27 @@
 	return -1;
 }
 
-static inline void __rte_unused
-mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
-		  struct mlx5_vdpa_cq *cq)
+static inline uint32_t
+mlx5_vdpa_cq_poll(struct mlx5_vdpa_cq *cq)
 {
 	struct mlx5_vdpa_event_qp *eqp =
 				container_of(cq, struct mlx5_vdpa_event_qp, cq);
 	const unsigned int cq_size = 1 << cq->log_desc_n;
 	const unsigned int cq_mask = cq_size - 1;
+	uint32_t total = 0;
 	int ret;
 
 	do {
-		volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
-							    cq_mask);
+		volatile struct mlx5_cqe *cqe = cq->cqes + ((cq->cq_ci + total)
+							    & cq_mask);
 
-		ret = check_cqe(cqe, cq_size, cq->cq_ci);
+		ret = check_cqe(cqe, cq_size, cq->cq_ci + total);
 		switch (ret) {
 		case MLX5_CQE_STATUS_ERR:
 			cq->errors++;
 			/*fall-through*/
 		case MLX5_CQE_STATUS_SW_OWN:
-			cq->cq_ci++;
+			total++;
 			break;
 		case MLX5_CQE_STATUS_HW_OWN:
 		default:
@@ -196,21 +224,86 @@
 		}
 	} while (ret != MLX5_CQE_STATUS_HW_OWN);
 	rte_io_wmb();
+	cq->cq_ci += total;
 	/* Ring CQ doorbell record. */
 	cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
 	rte_io_wmb();
 	/* Ring SW QP doorbell record. */
 	eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cq_size);
+	return total;
+}
+
+static void
+mlx5_vdpa_arm_all_cqs(struct mlx5_vdpa_priv *priv)
+{
+	struct mlx5_vdpa_cq *cq;
+	int i;
+
+	for (i = 0; i < priv->nr_virtqs; i++) {
+		cq = &priv->virtqs[i].eqp.cq;
+		if (cq->cq && !cq->armed)
+			mlx5_vdpa_cq_arm(priv, cq);
+	}
+}
+
+static void *
+mlx5_vdpa_poll_handle(void *arg)
+{
+	struct mlx5_vdpa_priv *priv = arg;
+	int i;
+	struct mlx5_vdpa_cq *cq;
+	uint32_t total;
+	uint64_t current_tic;
+
+	pthread_mutex_lock(&priv->timer_lock);
+	while (!priv->timer_on)
+		pthread_cond_wait(&priv->timer_cond, &priv->timer_lock);
+	pthread_mutex_unlock(&priv->timer_lock);
+	while (1) {
+		total = 0;
+		for (i = 0; i < priv->nr_virtqs; i++) {
+			cq = &priv->virtqs[i].eqp.cq;
+			if (cq->cq && !cq->armed) {
+				uint32_t comp = mlx5_vdpa_cq_poll(cq);
+
+				if (comp) {
+					/* Notify guest for descs consuming. */
+					if (cq->callfd != -1)
+						eventfd_write(cq->callfd,
+							      (eventfd_t)1);
+					total += comp;
+				}
+			}
+		}
+		current_tic = rte_rdtsc();
+		if (!total) {
+			/* No traffic ? stop timer and load interrupts. */
+			if (current_tic - priv->last_traffic_tic >=
+			    rte_get_timer_hz() * MLX5_VDPA_NO_TRAFFIC_TIME_S) {
+				DRV_LOG(DEBUG, "Device %d traffic was stopped.",
+					priv->id);
+				mlx5_vdpa_arm_all_cqs(priv);
+				pthread_mutex_lock(&priv->timer_lock);
+				priv->timer_on = 0;
+				while (!priv->timer_on)
+					pthread_cond_wait(&priv->timer_cond,
+							  &priv->timer_lock);
+				pthread_mutex_unlock(&priv->timer_lock);
+				continue;
+			}
+		} else {
+			priv->last_traffic_tic = current_tic;
+		}
+		usleep(priv->timer_delay_us);
+	}
+	return NULL;
 }
 
 static void
 mlx5_vdpa_interrupt_handler(void *cb_arg)
 {
-#ifndef HAVE_IBV_DEVX_EVENT
-	(void)cb_arg;
-	return;
-#else
 	struct mlx5_vdpa_priv *priv = cb_arg;
+#ifdef HAVE_IBV_DEVX_EVENT
 	union {
 		struct mlx5dv_devx_async_event_hdr event_resp;
 		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
@@ -221,17 +314,29 @@
 				       (ssize_t)sizeof(out.event_resp.cookie)) {
 		struct mlx5_vdpa_cq *cq = (struct mlx5_vdpa_cq *)
 					       (uintptr_t)out.event_resp.cookie;
-		rte_spinlock_lock(&cq->sl);
-		mlx5_vdpa_cq_poll(priv, cq);
-		mlx5_vdpa_cq_arm(priv, cq);
-		if (cq->callfd != -1)
-			/* Notify guest for descriptors consuming. */
-			eventfd_write(cq->callfd, (eventfd_t)1);
-		rte_spinlock_unlock(&cq->sl);
-		DRV_LOG(DEBUG, "CQ %d event: new cq_ci = %u.", cq->cq->id,
-			cq->cq_ci);
+		struct mlx5_vdpa_event_qp *eqp = container_of(cq,
+						 struct mlx5_vdpa_event_qp, cq);
+		struct mlx5_vdpa_virtq *virtq = container_of(eqp,
+						   struct mlx5_vdpa_virtq, eqp);
+
+		mlx5_vdpa_cq_poll(cq);
+		/* Don't arm again - timer will take control. */
+		DRV_LOG(DEBUG, "Device %d virtq %d cq %d event was captured."
+			" Timer is %s, cq ci is %u.\n", priv->id,
+			(int)virtq->index, cq->cq->id, priv->timer_on ? "on" :
+			"off", cq->cq_ci);
+		cq->armed = 0;
+	}
+#endif
+
+	/* Traffic detected: make sure timer is on. */
+	priv->last_traffic_tic = rte_rdtsc();
+	pthread_mutex_lock(&priv->timer_lock);
+	if (!priv->timer_on) {
+		priv->timer_on = 1;
+		pthread_cond_signal(&priv->timer_cond);
 	}
-#endif /* HAVE_IBV_DEVX_ASYNC */
+	pthread_mutex_unlock(&priv->timer_lock);
 }
 
 int
@@ -243,12 +348,21 @@
 	if (!priv->eventc)
 		/* All virtqs are in poll mode. */
 		return 0;
+	pthread_mutex_init(&priv->timer_lock, NULL);
+	pthread_cond_init(&priv->timer_cond, NULL);
+	priv->timer_on = 0;
+	priv->timer_delay_us = MLX5_VDPA_DEFAULT_TIMER_DELAY_US;
+	ret = pthread_create(&priv->timer_tid, NULL, mlx5_vdpa_poll_handle,
+			     (void *)priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to create timer thread.");
+		return -1;
+	}
 	flags = fcntl(priv->eventc->fd, F_GETFL);
 	ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to change event channel FD.");
-		rte_errno = errno;
-		return -rte_errno;
+		goto error;
 	}
 	priv->intr_handle.fd = priv->eventc->fd;
 	priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
@@ -256,9 +370,12 @@
 				       mlx5_vdpa_interrupt_handler, priv)) {
 		priv->intr_handle.fd = 0;
 		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
-		return -rte_errno;
+		goto error;
 	}
 	return 0;
+error:
+	mlx5_vdpa_cqe_event_unset(priv);
+	return -1;
 }
 
 void
@@ -266,6 +383,7 @@
 {
 	int retries = MLX5_VDPA_INTR_RETRIES;
 	int ret = -EAGAIN;
+	void *status;
 
 	if (priv->intr_handle.fd) {
 		while (retries-- && ret == -EAGAIN) {
@@ -276,11 +394,16 @@
 				DRV_LOG(DEBUG, "Try again to unregister fd %d "
 					"of CQ interrupt, retries = %d.",
 					priv->intr_handle.fd, retries);
-				usleep(MLX5_VDPA_INTR_RETRIES_USEC);
+				rte_pause();
 			}
 		}
 		memset(&priv->intr_handle, 0, sizeof(priv->intr_handle));
 	}
+	if (priv->timer_tid) {
+		pthread_cancel(priv->timer_tid);
+		pthread_join(priv->timer_tid, &status);
+	}
+	priv->timer_tid = 0;
 }
 
 void
-- 
1.8.3.1


  reply index

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-18 19:11 [dpdk-dev] [PATCH 0/3] vdpa/mlx5: optimize cpu utilization Matan Azrad
2020-06-18 19:11 ` [dpdk-dev] [PATCH 1/3] vdpa/mlx5: optimize notification events Matan Azrad
2020-06-18 19:11 ` [dpdk-dev] [PATCH 2/3] vdpa/mlx5: optimize completion queue poll Matan Azrad
2020-06-18 19:11 ` [dpdk-dev] [PATCH 3/3] vdpa/mlx5: add traffic control device arguments Matan Azrad
2020-06-25 13:30 ` [dpdk-dev] [PATCH v2 0/3] vdpa/mlx5: optimize cpu utilization Matan Azrad
2020-06-25 13:30   ` Matan Azrad [this message]
2020-06-29  9:05     ` [dpdk-dev] [PATCH v2 1/3] vdpa/mlx5: optimize notification events Maxime Coquelin
2020-06-25 13:30   ` [dpdk-dev] [PATCH v2 2/3] vdpa/mlx5: optimize completion queue poll Matan Azrad
2020-06-29  9:11     ` Maxime Coquelin
2020-06-25 13:30   ` [dpdk-dev] [PATCH v2 3/3] vdpa/mlx5: control completion queue event mode Matan Azrad
2020-06-29  9:16     ` Maxime Coquelin
2020-06-29 14:01   ` [dpdk-dev] [PATCH v3 0/3] vdpa/mlx5: optimize cpu utilization Matan Azrad
2020-06-29 14:01     ` [dpdk-dev] [PATCH v3 1/3] vdpa/mlx5: optimize notification events Matan Azrad
2020-06-29 14:01     ` [dpdk-dev] [PATCH v3 2/3] vdpa/mlx5: optimize completion queue poll Matan Azrad
2020-06-29 14:01     ` [dpdk-dev] [PATCH v3 3/3] vdpa/mlx5: control completion queue event mode Matan Azrad
2020-06-29 17:24     ` [dpdk-dev] [PATCH v3 0/3] vdpa/mlx5: optimize cpu utilization Maxime Coquelin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1593091838-51869-2-git-send-email-matan@mellanox.com \
    --to=matan@mellanox.com \
    --cc=dev@dpdk.org \
    --cc=maxime.coquelin@redhat.com \
    --cc=viacheslavo@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

DPDK patches and discussions

Archives are clonable:
	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev


Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox