DPDK patches and discussions
 help / color / mirror / Atom feed
From: <pbhagavatula@marvell.com>
To: <jerinj@marvell.com>, Pavan Nikhilesh <pbhagavatula@marvell.com>,
	"Shijith Thotton" <sthotton@marvell.com>,
	Nithin Dabilpuram <ndabilpuram@marvell.com>,
	Kiran Kumar K <kirankumark@marvell.com>,
	Sunil Kumar Kori <skori@marvell.com>,
	Satha Rao <skoteshwar@marvell.com>
Cc: <dev@dpdk.org>
Subject: [PATCH 4/4] net/cnxk: improve Rx performance
Date: Tue, 14 Dec 2021 02:44:24 +0530	[thread overview]
Message-ID: <20211213211425.6332-4-pbhagavatula@marvell.com> (raw)
In-Reply-To: <20211213211425.6332-1-pbhagavatula@marvell.com>

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve vWQE and CQ Rx performance by tuning perfetches to 64B
cacheline size.
Also, prefetch the vWQE array offsets at cacheline boundaries.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h | 25 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx.h       |  8 ++++----
 drivers/net/cnxk/cn9k_rx.h        | 20 ++++++++++----------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 65602a632e..6e77d32827 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,11 +118,17 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
 	uint8_t loff = 0;
 	uint64_t sa_base;
 	uint64_t **wqe;
+	int i;

 	mbuf_init |= ((uint64_t)port_id) << 48;
 	vec = (struct rte_event_vector *)vwqe;
 	wqe = vec->u64s;

+	rte_prefetch_non_temporal(&vec->ptrs[0]);
+#define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
+	for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
+		rte_prefetch_non_temporal(&vec->ptrs[i]);
+
 	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
 	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
 					      flags | NIX_RX_VWQE_F, lookup_mem,
@@ -191,15 +197,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		uint64_t u64[2];
 	} gw;
 	uint64_t tstamp_ptr;
-	uint64_t mbuf;

 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
-		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		"sub %[mbuf], %H[wdata], #0x80				\n"
-		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
+		"caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+		: [wdata] "+r"(gw.get_work)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -208,14 +212,12 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
-	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	ws->gw_rdata = gw.u64[0];
-	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
-		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
-		    (gw.u64[0] & 0xffffffff);
-
-	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+	if (gw.u64[1]) {
+		gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+			    (gw.u64[0] & (0x3FFull << 36)) << 4 |
+			    (gw.u64[0] & 0xffffffff);
 		if ((flags & CPT_RX_WQE_F) &&
 		    (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 		     RTE_EVENT_TYPE_CRYPTODEV)) {
@@ -223,7 +225,10 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
 			   RTE_EVENT_TYPE_ETHDEV) {
 			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			uint64_t mbuf;

+			mbuf = gw.u64[1] - sizeof(struct rte_mbuf);
+			rte_prefetch0((void *)mbuf);
 			if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
 				struct rte_mbuf *m;
 				uintptr_t sa_base;
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index a2442d3726..9694a3080f 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -610,10 +610,10 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
 		}

 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
-		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));

 		/* Get NIX_RX_SG_S for size and buffer pointer */
 		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index b038b1a6ef..fa4efbf80a 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -342,16 +342,16 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags =
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);

-	mbuf->pkt_len = len;
-	mbuf->data_len = len;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
-
 	mbuf->ol_flags = ol_flags;
+	*(uint64_t *)(&mbuf->rearm_data) = val;
+	mbuf->pkt_len = len;

-	if (flag & NIX_RX_MULTI_SEG_F)
+	if (flag & NIX_RX_MULTI_SEG_F) {
 		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
-	else
+	} else {
+		mbuf->data_len = len;
 		mbuf->next = NULL;
+	}
 }

 static inline uint16_t
@@ -723,10 +723,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
-
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
@@ -751,6 +747,10 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			mbuf3->next = NULL;
 		}

+		/* Store the mbufs to rx_pkts */
+		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
+		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
--
2.17.1


      parent reply	other threads:[~2021-12-13 21:15 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-13 21:14 [PATCH 1/4] net/cnxk: avoid command copy from Tx queue pbhagavatula
2021-12-13 21:14 ` [PATCH 2/4] event/cnxk: store and reuse workslot status pbhagavatula
2021-12-13 21:14 ` [PATCH 3/4] event/cnxk: disable default wait time for dequeue pbhagavatula
2021-12-13 21:14 ` pbhagavatula [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211213211425.6332-4-pbhagavatula@marvell.com \
    --to=pbhagavatula@marvell.com \
    --cc=dev@dpdk.org \
    --cc=jerinj@marvell.com \
    --cc=kirankumark@marvell.com \
    --cc=ndabilpuram@marvell.com \
    --cc=skori@marvell.com \
    --cc=skoteshwar@marvell.com \
    --cc=sthotton@marvell.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).