* [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction
@ 2022-02-24 13:52 pbhagavatula
  2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  0 siblings, 2 replies; 7+ messages in thread

From: pbhagavatula @ 2022-02-24 13:52 UTC
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.

Packet length can also be calculated from NIX_PARSE_S, which lets us
completely avoid reading the second cache line, depending on the
offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_rx.h | 75 +++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..65a08e379b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,7 +590,7 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
                    *(uint64_t *)args :
                    rxq->mbuf_initializer;
     const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-                vdupq_n_u64(0x80ULL) :
+                vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
                 vdupq_n_u64(rxq->data_off);
     const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
     const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));

         if (!(flags & NIX_RX_VWQE_F)) {
+            /* Get NIX_RX_SG_S for size and buffer pointer */
+            cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+            cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+            cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+            cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
             /* Extract mbuf from NIX_RX_SG_S */
             mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
             mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,24 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
         mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);

-        /* Mask to get packet len from NIX_RX_SG_S */
-        const uint8x16_t shuf_msk = {
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0, 1,       /* octet 1~0, low 16 bits pkt_len */
-            0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-            0, 1,       /* octet 1~0, 16 bits data_len */
-            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-        /* Form the rx_descriptor_fields1 with pkt_len and data_len */
-        f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-        f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-        f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-        f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
-
+        if (!(flags & NIX_RX_VWQE_F)) {
+            /* Mask to get packet len from NIX_RX_SG_S */
+            const uint8x16_t shuf_msk = {
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0, 1,       /* octet 1~0, low 16 bits pkt_len */
+                0xFF, 0xFF, /* skip high 16 bits pkt_len, zero
+                               out */
+                0, 1,       /* octet 1~0, 16 bits data_len */
+                0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
+            /* Form the rx_descriptor_fields1 with pkt_len and
+             * data_len */
+            f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+            f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+            f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+            f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+        }
         if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
             /* Prefetch probable CPT parse header area */
             rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +740,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         /* Load CQE word0 and word 1 */
         const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
         const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+        const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
         const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
         const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+        const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
         const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
         const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+        const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
         const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
         const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+        const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+        if (flags & NIX_RX_VWQE_F) {
+            uint16_t psize0, psize1, psize2, psize3;
+
+            psize0 = (cq0_w2 & 0xFFFF) + 1;
+            psize1 = (cq1_w2 & 0xFFFF) + 1;
+            psize2 = (cq2_w2 & 0xFFFF) + 1;
+            psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+            f0 = vdupq_n_u64(0);
+            f1 = vdupq_n_u64(0);
+            f2 = vdupq_n_u64(0);
+            f3 = vdupq_n_u64(0);
+
+            f0 = vsetq_lane_u16(psize0, f0, 2);
+            f0 = vsetq_lane_u16(psize0, f0, 4);
+
+            f1 = vsetq_lane_u16(psize1, f1, 2);
+            f1 = vsetq_lane_u16(psize1, f1, 4);
+
+            f2 = vsetq_lane_u16(psize2, f2, 2);
+            f2 = vsetq_lane_u16(psize2, f2, 4);
+
+            f3 = vsetq_lane_u16(psize3, f3, 2);
+            f3 = vsetq_lane_u16(psize3, f3, 4);
+        }

         if (flags & NIX_RX_OFFLOAD_RSS_F) {
             /* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +844,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         }

         if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-            uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-            uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-            uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-            uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);

             ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
             ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
--
2.17.1
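The lane writes in the patch's vWQE branch are easier to follow in isolation. The stand-alone sketch below is illustrative only, not driver code: it assumes the rx_descriptor_fields1 region of struct rte_mbuf viewed as eight 16-bit lanes (lanes 0-1 packet_type, 2-3 pkt_len, 4 data_len), which is why the same size goes into lane 2 (low half of pkt_len) and lane 4 (data_len). The "+ 1" reflects the length-minus-one encoding of the parse structure's length field.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Pretend CQE word 2 arrived with a length-minus-one of 63,
     * i.e. a 64-byte frame. */
    uint64_t cq_w2 = 63;
    uint16_t psize = (uint16_t)((cq_w2 & 0xFFFF) + 1);

    uint16x8_t f = vdupq_n_u16(0);

    f = vsetq_lane_u16(psize, f, 2); /* low 16 bits of pkt_len */
    f = vsetq_lane_u16(psize, f, 4); /* data_len */

    printf("pkt_len=%u data_len=%u\n",
           vgetq_lane_u16(f, 2), vgetq_lane_u16(f, 4));
    return 0; /* prints pkt_len=64 data_len=64 */
}

Built with any AArch64 compiler (e.g. gcc -O2 demo.c); no DPDK headers are required, since the point is only the lane arithmetic the patch relies on.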
* [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model
  2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
@ 2022-02-24 13:52 ` pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  1 sibling, 0 replies; 7+ messages in thread

From: pbhagavatula @ 2022-02-24 13:52 UTC
To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
    Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches to the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when multi-segment offload is enabled, to reduce L1 pressure.
Add macros to detect corrupted mbuf->next values when MEMPOOL_DEBUG
is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 115 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
     uint64_t aura_handle, laddr;
     uint16_t nb_mbufs, non_vec;
     uint16_t lmt_id, d_off;
+    struct rte_mbuf **wqe;
     struct rte_mbuf *mbuf;
     uint8_t loff = 0;
     uint64_t sa_base;
-    uint64_t **wqe;
     int i;

     mbuf_init |= ((uint64_t)port_id) << 48;
     vec = (struct rte_event_vector *)vwqe;
-    wqe = vec->u64s;
+    wqe = vec->mbufs;

-    rte_prefetch_non_temporal(&vec->ptrs[0]);
+    rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
     for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-        rte_prefetch_non_temporal(&vec->ptrs[i]);
+        rte_prefetch0(&vec->ptrs[i]);

     nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
                                           flags | NIX_RX_VWQE_F, lookup_mem,
                                           tstamp, lbase);
     wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
             cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
                                      flags & NIX_RX_OFFLOAD_TSTAMP_F,
                                      (uint64_t *)tstamp_ptr);
-            wqe[0] = (uint64_t *)mbuf;
+            wqe[0] = (struct rte_mbuf *)mbuf;
             non_vec--;
             wqe++;
         }
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
                        ev->sched_type, txq_data, flags);
     }
     rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+    rte_prefetch0(ws);
     return (meta & 0xFFFF);
 }

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 65a08e379b..66a35c69f9 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,22 @@
     (((f) & NIX_RX_VWQE_F) ?                                        \
         (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) :     \
         (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                    \
+    (((f) & NIX_RX_VWQE_F) ?                                        \
+        (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :     \
+        (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define NIX_MBUF_VALIDATE_NEXT(m)                                   \
+    if (m->nb_segs == 1 && mbuf->next) {                            \
+        rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",  \
+                  m->next, m->nb_segs);                             \
+    }
+#else
+#define NIX_MBUF_VALIDATE_NEXT(m)                                   \
+    do {                                                            \
+    } while (0)
+#endif

 union mbuf_initializer {
     struct {
@@ -674,17 +690,73 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             cq0 = (uintptr_t)&mbufs[packets];
         }

-        /* Prefetch N desc ahead */
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
-
-        /* Get NIX_RX_SG_S for size and buffer pointer */
-        cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-        cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-        cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-        cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+        if (flags & NIX_RX_VWQE_F) {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 4, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 5, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 6, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 7, 0, flags));
+
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 8, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 9, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 10, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 11, 0, flags));
+                    if (pkts - packets > 12) {
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 12, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 13, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 14, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 15, 0, flags));
+                    }
+                }
+
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 4, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 5, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 6, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 7, RTE_PKTMBUF_HEADROOM, flags));
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 8, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 9, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 10, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 11, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                }
+            }
+        } else {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 4, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 5, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 6, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 7, 64, flags));
+            }
+        }

         if (!(flags & NIX_RX_VWQE_F)) {
             /* Get NIX_RX_SG_S for size and buffer pointer */
@@ -997,19 +1069,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             nix_cqe_xtract_mseg((union nix_rx_parse_u *)
                         (CQE_PTR_OFF(cq0, 3, 8, flags)),
                     mbuf3, mbuf_initializer, flags);
-        } else {
-            /* Update that no more segments */
-            mbuf0->next = NULL;
-            mbuf1->next = NULL;
-            mbuf2->next = NULL;
-            mbuf3->next = NULL;
         }

-        /* Prefetch mbufs */
-        roc_prefetch_store_keep(mbuf0);
-        roc_prefetch_store_keep(mbuf1);
-        roc_prefetch_store_keep(mbuf2);
-        roc_prefetch_store_keep(mbuf3);
+        /* Mark mempool obj as "get" as it is alloc'ed by NIX */
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+        NIX_MBUF_VALIDATE_NEXT(mbuf0);
+        NIX_MBUF_VALIDATE_NEXT(mbuf1);
+        NIX_MBUF_VALIDATE_NEXT(mbuf2);
+        NIX_MBUF_VALIDATE_NEXT(mbuf3);

         packets += NIX_DESCS_PER_LOOP;

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
             lnum += 1;
         }

+        if (flags & NIX_TX_MULTI_SEG_F) {
+            tx_pkts[0]->next = NULL;
+            tx_pkts[1]->next = NULL;
+            tx_pkts[2]->next = NULL;
+            tx_pkts[3]->next = NULL;
+        }
+
         tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
     }
--
2.17.1
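The nested distance checks above are the substance of the "cache model" alignment: lines needed in the next iteration are pulled close, lines needed two groups out are only staged in L2, so a deep vWQE burst never over-commits L1. The sketch below restates that staging with the compiler builtin; the locality mapping (0 for non-temporal, 2 roughly rte_prefetch1, 3 roughly rte_prefetch0) is the conventional one and the thresholds mirror the diff, but the helper itself is a simplified illustration, not the driver's code.

#include <stdint.h>

/* Staged software prefetch over an array of packet pointers.
 * objs[i..i+3] are being processed now; pull i+4..i+7 in with a
 * non-temporal hint and stage i+8..i+15 only as far as L2. */
static inline void
stage_prefetch(void *const *objs, uint32_t i, uint32_t n)
{
    uint32_t j;

    if (n - i > 4)
        for (j = i + 4; j < i + 8 && j < n; j++)
            __builtin_prefetch(objs[j], 0, 0); /* non-temporal read */
    if (n - i > 8)
        for (j = i + 8; j < i + 16 && j < n; j++)
            __builtin_prefetch(objs[j], 0, 2); /* keep in L2 */
}

The CQE_PTR_DIFF(cq0, i, RTE_PKTMBUF_HEADROOM, flags) calls in the patch apply the same idea to a second stream: in vWQE mode the entry is a packet-data pointer, so stepping back by the headroom lands the prefetch on the buffer start near the mbuf metadata that the burst loop is about to write.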
* [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction
  2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model pbhagavatula
@ 2022-02-24 16:10 ` pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model pbhagavatula
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  1 sibling, 2 replies; 7+ messages in thread

From: pbhagavatula @ 2022-02-24 16:10 UTC
To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.

Packet length can also be calculated from NIX_PARSE_S, which lets us
completely avoid reading the second cache line, depending on the
offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Reword commit message.

 drivers/net/cnxk/cn10k_rx.h | 75 +++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..65a08e379b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,7 +590,7 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
                    *(uint64_t *)args :
                    rxq->mbuf_initializer;
     const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-                vdupq_n_u64(0x80ULL) :
+                vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
                 vdupq_n_u64(rxq->data_off);
     const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
     const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));

         if (!(flags & NIX_RX_VWQE_F)) {
+            /* Get NIX_RX_SG_S for size and buffer pointer */
+            cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+            cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+            cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+            cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
             /* Extract mbuf from NIX_RX_SG_S */
             mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
             mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,24 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
         mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);

-        /* Mask to get packet len from NIX_RX_SG_S */
-        const uint8x16_t shuf_msk = {
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0, 1,       /* octet 1~0, low 16 bits pkt_len */
-            0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-            0, 1,       /* octet 1~0, 16 bits data_len */
-            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-        /* Form the rx_descriptor_fields1 with pkt_len and data_len */
-        f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-        f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-        f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-        f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
-
+        if (!(flags & NIX_RX_VWQE_F)) {
+            /* Mask to get packet len from NIX_RX_SG_S */
+            const uint8x16_t shuf_msk = {
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0, 1,       /* octet 1~0, low 16 bits pkt_len */
+                0xFF, 0xFF, /* skip high 16 bits pkt_len, zero
+                               out */
+                0, 1,       /* octet 1~0, 16 bits data_len */
+                0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
+            /* Form the rx_descriptor_fields1 with pkt_len and
+             * data_len */
+            f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+            f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+            f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+            f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+        }
         if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
             /* Prefetch probable CPT parse header area */
             rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +740,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         /* Load CQE word0 and word 1 */
         const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
         const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+        const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
         const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
         const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+        const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
         const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
         const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+        const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
         const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
         const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+        const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+        if (flags & NIX_RX_VWQE_F) {
+            uint16_t psize0, psize1, psize2, psize3;
+
+            psize0 = (cq0_w2 & 0xFFFF) + 1;
+            psize1 = (cq1_w2 & 0xFFFF) + 1;
+            psize2 = (cq2_w2 & 0xFFFF) + 1;
+            psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+            f0 = vdupq_n_u64(0);
+            f1 = vdupq_n_u64(0);
+            f2 = vdupq_n_u64(0);
+            f3 = vdupq_n_u64(0);
+
+            f0 = vsetq_lane_u16(psize0, f0, 2);
+            f0 = vsetq_lane_u16(psize0, f0, 4);
+
+            f1 = vsetq_lane_u16(psize1, f1, 2);
+            f1 = vsetq_lane_u16(psize1, f1, 4);
+
+            f2 = vsetq_lane_u16(psize2, f2, 2);
+            f2 = vsetq_lane_u16(psize2, f2, 4);
+
+            f3 = vsetq_lane_u16(psize3, f3, 2);
+            f3 = vsetq_lane_u16(psize3, f3, 4);
+        }

         if (flags & NIX_RX_OFFLOAD_RSS_F) {
             /* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +844,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         }

         if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-            uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-            uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-            uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-            uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);

             ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
             ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
--
2.17.1
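Every version of this patch leans on CQE_PTR_OFF, whose dual behaviour is easy to miss in the macro soup. A function-style restatement follows; it is a sketch only, and the 128-byte descriptor stride in MY_CQE_SZ is an assumption for illustration (the real CQE_SZ() lives in the driver headers).

#include <stdint.h>

#define MY_CQE_SZ(i) ((uintptr_t)(i) * 128) /* assumed descriptor stride */

/* With the vWQE flag set, 'b' is an array of packet pointers (the
 * event vector), so entry i is dereferenced and 'o' offsets into the
 * packet.  Without it, 'b' is the base of a flat CQE ring and i
 * selects an entry by stride. */
static inline uint64_t *
cqe_ptr_off(uintptr_t b, uint32_t i, uint32_t o, int vwqe)
{
    if (vwqe)
        return (uint64_t *)(((uintptr_t *)b)[i] + o);
    return (uint64_t *)(b + MY_CQE_SZ(i) + o);
}

This is why the same vector loop body serves both the ethdev Rx burst and the event-device vWQE path: only the pointer resolution differs, and the compile-time flag lets the compiler discard the unused branch.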
* [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
@ 2022-02-24 16:10 ` pbhagavatula
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  1 sibling, 0 replies; 7+ messages in thread

From: pbhagavatula @ 2022-02-24 16:10 UTC
To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
    Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches to the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when multi-segment offload is enabled, to reduce L1 pressure.
Add macros to detect corrupted mbuf->next values when MEMPOOL_DEBUG
is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 115 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
     uint64_t aura_handle, laddr;
     uint16_t nb_mbufs, non_vec;
     uint16_t lmt_id, d_off;
+    struct rte_mbuf **wqe;
     struct rte_mbuf *mbuf;
     uint8_t loff = 0;
     uint64_t sa_base;
-    uint64_t **wqe;
     int i;

     mbuf_init |= ((uint64_t)port_id) << 48;
     vec = (struct rte_event_vector *)vwqe;
-    wqe = vec->u64s;
+    wqe = vec->mbufs;

-    rte_prefetch_non_temporal(&vec->ptrs[0]);
+    rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
     for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-        rte_prefetch_non_temporal(&vec->ptrs[i]);
+        rte_prefetch0(&vec->ptrs[i]);

     nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
                                           flags | NIX_RX_VWQE_F, lookup_mem,
                                           tstamp, lbase);
     wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
             cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
                                      flags & NIX_RX_OFFLOAD_TSTAMP_F,
                                      (uint64_t *)tstamp_ptr);
-            wqe[0] = (uint64_t *)mbuf;
+            wqe[0] = (struct rte_mbuf *)mbuf;
             non_vec--;
             wqe++;
         }
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
                        ev->sched_type, txq_data, flags);
     }
     rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+    rte_prefetch0(ws);
     return (meta & 0xFFFF);
 }

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 65a08e379b..66a35c69f9 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,22 @@
     (((f) & NIX_RX_VWQE_F) ?                                        \
         (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) :     \
         (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                    \
+    (((f) & NIX_RX_VWQE_F) ?                                        \
+        (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :     \
+        (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+#define NIX_MBUF_VALIDATE_NEXT(m)                                   \
+    if (m->nb_segs == 1 && mbuf->next) {                            \
+        rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",  \
+                  m->next, m->nb_segs);                             \
+    }
+#else
+#define NIX_MBUF_VALIDATE_NEXT(m)                                   \
+    do {                                                            \
+    } while (0)
+#endif

 union mbuf_initializer {
     struct {
@@ -674,17 +690,73 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             cq0 = (uintptr_t)&mbufs[packets];
         }

-        /* Prefetch N desc ahead */
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
-
-        /* Get NIX_RX_SG_S for size and buffer pointer */
-        cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-        cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-        cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-        cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+        if (flags & NIX_RX_VWQE_F) {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 4, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 5, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 6, 0, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 7, 0, flags));
+
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 8, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 9, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 10, 0, flags));
+                    rte_prefetch1(
+                        CQE_PTR_OFF(cq0, 11, 0, flags));
+                    if (pkts - packets > 12) {
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 12, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 13, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 14, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(
+                            cq0, 15, 0, flags));
+                    }
+                }
+
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 4, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 5, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 6, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(
+                    cq0, 7, RTE_PKTMBUF_HEADROOM, flags));
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 8, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 9, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 10, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                    rte_prefetch0(CQE_PTR_DIFF(
+                        cq0, 11, RTE_PKTMBUF_HEADROOM,
+                        flags));
+                }
+            }
+        } else {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 4, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 5, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 6, 64, flags));
+                rte_prefetch_non_temporal(
+                    CQE_PTR_OFF(cq0, 7, 64, flags));
+            }
+        }

         if (!(flags & NIX_RX_VWQE_F)) {
             /* Get NIX_RX_SG_S for size and buffer pointer */
@@ -997,19 +1069,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             nix_cqe_xtract_mseg((union nix_rx_parse_u *)
                         (CQE_PTR_OFF(cq0, 3, 8, flags)),
                     mbuf3, mbuf_initializer, flags);
-        } else {
-            /* Update that no more segments */
-            mbuf0->next = NULL;
-            mbuf1->next = NULL;
-            mbuf2->next = NULL;
-            mbuf3->next = NULL;
         }

-        /* Prefetch mbufs */
-        roc_prefetch_store_keep(mbuf0);
-        roc_prefetch_store_keep(mbuf1);
-        roc_prefetch_store_keep(mbuf2);
-        roc_prefetch_store_keep(mbuf3);
+        /* Mark mempool obj as "get" as it is alloc'ed by NIX */
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+        NIX_MBUF_VALIDATE_NEXT(mbuf0);
+        NIX_MBUF_VALIDATE_NEXT(mbuf1);
+        NIX_MBUF_VALIDATE_NEXT(mbuf2);
+        NIX_MBUF_VALIDATE_NEXT(mbuf3);

         packets += NIX_DESCS_PER_LOOP;

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
             lnum += 1;
         }

+        if (flags & NIX_TX_MULTI_SEG_F) {
+            tx_pkts[0]->next = NULL;
+            tx_pkts[1]->next = NULL;
+            tx_pkts[2]->next = NULL;
+            tx_pkts[3]->next = NULL;
+        }
+
         tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
     }
--
2.17.1
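For readers wondering what NIX_MBUF_VALIDATE_NEXT guards against: a buffer recycled through the pool with a stale chain pointer, which is exactly the hazard created by no longer clearing mbuf->next on every Rx. Note that the macro body above tests mbuf->next rather than m->next, so it only works where a variable named mbuf happens to be in scope; that is one reason v3 converts it to an inline function. A self-contained rendering of the check follows, where mini_mbuf is a hypothetical two-field stand-in for struct rte_mbuf and abort() stands in for rte_panic():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mini_mbuf {
    struct mini_mbuf *next; /* segment chain; NULL for a lone segment */
    uint16_t nb_segs;
};

static void
mbuf_validate_next(const struct mini_mbuf *m)
{
    if (m->nb_segs == 1 && m->next != NULL) {
        fprintf(stderr, "mbuf->next [%p] valid when nb_segs is %u\n",
                (void *)m->next, m->nb_segs);
        abort();
    }
}

int main(void)
{
    struct mini_mbuf ok = { .next = NULL, .nb_segs = 1 };
    struct mini_mbuf bad = { .next = &ok, .nb_segs = 1 };

    mbuf_validate_next(&ok);  /* passes silently */
    mbuf_validate_next(&bad); /* aborts, flagging the stale pointer */
    return 0;
}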
* [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction
  2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
  2022-02-24 16:10 ` [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model pbhagavatula
@ 2022-02-24 18:40 ` jerinj
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align prefetches to CN10K cache model jerinj
  2022-02-24 20:39 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob
  1 sibling, 2 replies; 7+ messages in thread

From: jerinj @ 2022-02-24 18:40 UTC
To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: Pavan Nikhilesh, Jerin Jacob

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

In vWQE mode, the mbuf address is calculated without using the
iova list.

Packet length can also be calculated from NIX_PARSE_S, which lets us
completely avoid reading the second cache line, depending on the
offloads enabled.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 v3:
 - Change NIX_MBUF_VALIDATE_NEXT macro to an inline function
 - Fix the checkpatch warning reported at
   http://mails.dpdk.org/archives/test-report/2022-February/264235.html

 v2:
 - Reword commit message.

 drivers/net/cnxk/cn10k_rx.h | 73 +++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abf280102b..236a1dca6e 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -590,8 +590,8 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
                    *(uint64_t *)args :
                    rxq->mbuf_initializer;
     const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
-                vdupq_n_u64(0x80ULL) :
-                vdupq_n_u64(rxq->data_off);
+                vdupq_n_u64(RTE_PKTMBUF_HEADROOM) :
+                vdupq_n_u64(rxq->data_off);
     const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
     const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
     const uintptr_t desc = flags & NIX_RX_VWQE_F ? 0 : rxq->desc;
@@ -687,6 +687,12 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));

         if (!(flags & NIX_RX_VWQE_F)) {
+            /* Get NIX_RX_SG_S for size and buffer pointer */
+            cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+            cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+            cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+            cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
             /* Extract mbuf from NIX_RX_SG_S */
             mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
             mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
@@ -705,21 +711,22 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         mbuf2 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 0);
         mbuf3 = (struct rte_mbuf *)vgetq_lane_u64(mbuf23, 1);

-        /* Mask to get packet len from NIX_RX_SG_S */
-        const uint8x16_t shuf_msk = {
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0xFF, 0xFF, /* pkt_type set as unknown */
-            0, 1,       /* octet 1~0, low 16 bits pkt_len */
-            0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */
-            0, 1,       /* octet 1~0, 16 bits data_len */
-            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
-
-        /* Form the rx_descriptor_fields1 with pkt_len and data_len */
-        f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
-        f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
-        f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
-        f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+        if (!(flags & NIX_RX_VWQE_F)) {
+            /* Mask to get packet len from NIX_RX_SG_S */
+            const uint8x16_t shuf_msk = {
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0xFF, 0xFF, /* pkt_type set as unknown */
+                0, 1,       /* octet 1~0, low 16 bits pkt_len */
+                0xFF, 0xFF, /* skip high 16it pkt_len, zero out */
+                0, 1,       /* octet 1~0, 16 bits data_len */
+                0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+            /* Form the rx_descriptor_fields1 with pkt_len and data_len */
+            f0 = vqtbl1q_u8(cq0_w8, shuf_msk);
+            f1 = vqtbl1q_u8(cq1_w8, shuf_msk);
+            f2 = vqtbl1q_u8(cq2_w8, shuf_msk);
+            f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
+        }

         if (flags & NIX_RX_OFFLOAD_SECURITY_F) {
             /* Prefetch probable CPT parse header area */
             rte_prefetch_non_temporal(RTE_PTR_ADD(mbuf0, d_off));
@@ -731,12 +738,42 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         /* Load CQE word0 and word 1 */
         const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
         const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+        const uint64_t cq0_w2 = *CQE_PTR_OFF(cq0, 0, 16, flags);
         const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
         const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+        const uint64_t cq1_w2 = *CQE_PTR_OFF(cq0, 1, 16, flags);
         const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
         const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+        const uint64_t cq2_w2 = *CQE_PTR_OFF(cq0, 2, 16, flags);
         const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
         const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
+        const uint64_t cq3_w2 = *CQE_PTR_OFF(cq0, 3, 16, flags);
+
+        if (flags & NIX_RX_VWQE_F) {
+            uint16_t psize0, psize1, psize2, psize3;
+
+            psize0 = (cq0_w2 & 0xFFFF) + 1;
+            psize1 = (cq1_w2 & 0xFFFF) + 1;
+            psize2 = (cq2_w2 & 0xFFFF) + 1;
+            psize3 = (cq3_w2 & 0xFFFF) + 1;
+
+            f0 = vdupq_n_u64(0);
+            f1 = vdupq_n_u64(0);
+            f2 = vdupq_n_u64(0);
+            f3 = vdupq_n_u64(0);
+
+            f0 = vsetq_lane_u16(psize0, f0, 2);
+            f0 = vsetq_lane_u16(psize0, f0, 4);
+
+            f1 = vsetq_lane_u16(psize1, f1, 2);
+            f1 = vsetq_lane_u16(psize1, f1, 4);
+
+            f2 = vsetq_lane_u16(psize2, f2, 2);
+            f2 = vsetq_lane_u16(psize2, f2, 4);
+
+            f3 = vsetq_lane_u16(psize3, f3, 2);
+            f3 = vsetq_lane_u16(psize3, f3, 4);
+        }

         if (flags & NIX_RX_OFFLOAD_RSS_F) {
             /* Fill rss in the rx_descriptor_fields1 */
@@ -805,10 +842,6 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
         }

         if (flags & NIX_RX_OFFLOAD_VLAN_STRIP_F) {
-            uint64_t cq0_w2 = *(uint64_t *)(cq0 + CQE_SZ(0) + 16);
-            uint64_t cq1_w2 = *(uint64_t *)(cq0 + CQE_SZ(1) + 16);
-            uint64_t cq2_w2 = *(uint64_t *)(cq0 + CQE_SZ(2) + 16);
-            uint64_t cq3_w2 = *(uint64_t *)(cq0 + CQE_SZ(3) + 16);

             ol_flags0 = nix_vlan_update(cq0_w2, ol_flags0, &f0);
             ol_flags1 = nix_vlan_update(cq1_w2, ol_flags1, &f1);
--
2.35.1
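The non-vWQE path keeps the table-lookup trick, which is worth a stand-alone demonstration. The sketch below (illustrative and AArch64-only, not driver code) feeds a fake NIX_RX_SG_S word carrying a segment size of 1514 through the same shuffle mask and reads the result back as the pkt_len/data_len lanes:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* The low 16 bits of NIX_RX_SG_S carry the first segment size. */
    uint8x16_t sg = vreinterpretq_u8_u64(vdupq_n_u64(1514));
    const uint8x16_t shuf_msk = {
        0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type left unknown */
        0, 1,                   /* low 16 bits of pkt_len */
        0xFF, 0xFF,             /* high 16 bits of pkt_len */
        0, 1,                   /* data_len */
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
    /* vqtbl1q_u8 writes 0 for out-of-range indices (0xFF), so the
     * unknown fields are zeroed by the shuffle itself, with no
     * separate masking step. */
    uint16x8_t f = vreinterpretq_u16_u8(vqtbl1q_u8(sg, shuf_msk));

    printf("pkt_len=%u data_len=%u\n",
           vgetq_lane_u16(f, 2), vgetq_lane_u16(f, 4));
    return 0; /* prints pkt_len=1514 data_len=1514 */
}

The vWQE branch added by this patch reaches the same lane layout with scalar extraction plus vsetq_lane_u16, because there the size comes from the parse word rather than from an already-loaded SG vector.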
* [dpdk-dev] [PATCH v3 2/2] net/cnxk: align prefetches to CN10K cache model
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
@ 2022-02-24 18:40 ` jerinj
  2022-02-24 20:39 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob
  1 sibling, 0 replies; 7+ messages in thread

From: jerinj @ 2022-02-24 18:40 UTC
To: dev, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
    Kiran Kumar K, Sunil Kumar Kori, Satha Rao
Cc: Jerin Jacob

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Align prefetches to the CN10K cache model for vWQE in Rx and Tx.
Move the mbuf->next NULL assignment to the Tx path and enable it only
when multi-segment offload is enabled, to reduce L1 pressure.
Add a debug check to detect corrupted mbuf->next values when
MEMPOOL_DEBUG is set.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 drivers/event/cnxk/cn10k_worker.h |  13 ++--
 drivers/net/cnxk/cn10k_rx.h       | 111 ++++++++++++++++++++++++------
 drivers/net/cnxk/cn10k_tx.h       |   7 ++
 3 files changed, 104 insertions(+), 27 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index d288c66cac..a827a1e422 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -118,23 +118,23 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
     uint64_t aura_handle, laddr;
     uint16_t nb_mbufs, non_vec;
     uint16_t lmt_id, d_off;
+    struct rte_mbuf **wqe;
     struct rte_mbuf *mbuf;
     uint8_t loff = 0;
     uint64_t sa_base;
-    uint64_t **wqe;
     int i;

     mbuf_init |= ((uint64_t)port_id) << 48;
     vec = (struct rte_event_vector *)vwqe;
-    wqe = vec->u64s;
+    wqe = vec->mbufs;

-    rte_prefetch_non_temporal(&vec->ptrs[0]);
+    rte_prefetch0(&vec->ptrs[0]);
 #define OBJS_PER_CLINE (RTE_CACHE_LINE_SIZE / sizeof(void *))
     for (i = OBJS_PER_CLINE; i < vec->nb_elem; i += OBJS_PER_CLINE)
-        rte_prefetch_non_temporal(&vec->ptrs[i]);
+        rte_prefetch0(&vec->ptrs[i]);

     nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
-    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+    nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, wqe, nb_mbufs,
                                           flags | NIX_RX_VWQE_F, lookup_mem,
                                           tstamp, lbase);
     wqe += nb_mbufs;
@@ -182,7 +182,7 @@ cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
             cn10k_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
                                      flags & NIX_RX_OFFLOAD_TSTAMP_F,
                                      (uint64_t *)tstamp_ptr);
-            wqe[0] = (uint64_t *)mbuf;
+            wqe[0] = (struct rte_mbuf *)mbuf;
             non_vec--;
             wqe++;
         }
@@ -612,6 +612,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
                        ev->sched_type, txq_data, flags);
     }
     rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+    rte_prefetch0(ws);
     return (meta & 0xFFFF);
 }

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 236a1dca6e..de5e41483b 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -36,6 +36,27 @@
     (((f) & NIX_RX_VWQE_F) ?                                        \
         (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) :     \
         (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+#define CQE_PTR_DIFF(b, i, o, f)                                    \
+    (((f) & NIX_RX_VWQE_F) ?                                        \
+        (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) - (o)) :     \
+        (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) - (o)))
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+static inline void
+nix_mbuf_validate_next(struct rte_mbuf *m)
+{
+    if (m->nb_segs == 1 && m->next) {
+        rte_panic("mbuf->next[%p] valid when mbuf->nb_segs is %d",
+                  m->next, m->nb_segs);
+    }
+}
+#else
+static inline void
+nix_mbuf_validate_next(struct rte_mbuf *m)
+{
+    RTE_SET_USED(m);
+}
+#endif

 union mbuf_initializer {
     struct {
@@ -674,17 +695,66 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             cq0 = (uintptr_t)&mbufs[packets];
         }

-        /* Prefetch N desc ahead */
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
-        rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
+        if (flags & NIX_RX_VWQE_F) {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+                    4, 0, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+                    5, 0, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+                    6, 0, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0,
+                    7, 0, flags));

-        /* Get NIX_RX_SG_S for size and buffer pointer */
-        cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
-        cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
-        cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
-        cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch1(CQE_PTR_OFF(cq0,
+                        8, 0, flags));
+                    rte_prefetch1(CQE_PTR_OFF(cq0,
+                        9, 0, flags));
+                    rte_prefetch1(CQE_PTR_OFF(cq0,
+                        10, 0, flags));
+                    rte_prefetch1(CQE_PTR_OFF(cq0,
+                        11, 0, flags));
+                    if (pkts - packets > 12) {
+                        rte_prefetch1(CQE_PTR_OFF(cq0,
+                            12, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(cq0,
+                            13, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(cq0,
+                            14, 0, flags));
+                        rte_prefetch1(CQE_PTR_OFF(cq0,
+                            15, 0, flags));
+                    }
+                }
+
+                rte_prefetch0(CQE_PTR_DIFF(cq0,
+                    4, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(cq0,
+                    5, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(cq0,
+                    6, RTE_PKTMBUF_HEADROOM, flags));
+                rte_prefetch0(CQE_PTR_DIFF(cq0,
+                    7, RTE_PKTMBUF_HEADROOM, flags));
+
+                if (likely(pkts - packets > 8)) {
+                    rte_prefetch0(CQE_PTR_DIFF(cq0,
+                        8, RTE_PKTMBUF_HEADROOM, flags));
+                    rte_prefetch0(CQE_PTR_DIFF(cq0,
+                        9, RTE_PKTMBUF_HEADROOM, flags));
+                    rte_prefetch0(CQE_PTR_DIFF(cq0,
+                        10, RTE_PKTMBUF_HEADROOM, flags));
+                    rte_prefetch0(CQE_PTR_DIFF(cq0,
+                        11, RTE_PKTMBUF_HEADROOM, flags));
+                }
+            }
+        } else {
+            if (pkts - packets > 4) {
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 4, 64, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 5, 64, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 6, 64, flags));
+                rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 7, 64, flags));
+            }
+        }

         if (!(flags & NIX_RX_VWQE_F)) {
             /* Get NIX_RX_SG_S for size and buffer pointer */
@@ -995,19 +1065,18 @@ cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
             nix_cqe_xtract_mseg((union nix_rx_parse_u *)
                         (CQE_PTR_OFF(cq0, 3, 8, flags)),
                     mbuf3, mbuf_initializer, flags);
-        } else {
-            /* Update that no more segments */
-            mbuf0->next = NULL;
-            mbuf1->next = NULL;
-            mbuf2->next = NULL;
-            mbuf3->next = NULL;
         }

-        /* Prefetch mbufs */
-        roc_prefetch_store_keep(mbuf0);
-        roc_prefetch_store_keep(mbuf1);
-        roc_prefetch_store_keep(mbuf2);
-        roc_prefetch_store_keep(mbuf3);
+        /* Mark mempool obj as "get" as it is alloc'ed by NIX */
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf0->pool, (void **)&mbuf0, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf1->pool, (void **)&mbuf1, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf2->pool, (void **)&mbuf2, 1, 1);
+        RTE_MEMPOOL_CHECK_COOKIES(mbuf3->pool, (void **)&mbuf3, 1, 1);
+
+        nix_mbuf_validate_next(mbuf0);
+        nix_mbuf_validate_next(mbuf1);
+        nix_mbuf_validate_next(mbuf2);
+        nix_mbuf_validate_next(mbuf3);

         packets += NIX_DESCS_PER_LOOP;

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ec6366168c..695e3ed354 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -2569,6 +2569,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
             lnum += 1;
         }

+        if (flags & NIX_TX_MULTI_SEG_F) {
+            tx_pkts[0]->next = NULL;
+            tx_pkts[1]->next = NULL;
+            tx_pkts[2]->next = NULL;
+            tx_pkts[3]->next = NULL;
+        }
+
         tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
     }
--
2.35.1
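The cn10k_worker.h hunk above also changes how the event vector's pointer array is warmed: since one prefetch pulls a whole cache line, touching every eighth pointer covers the array, and switching from the non-temporal hint to rte_prefetch0() keeps those lines resident for the burst loop that re-reads them immediately. A minimal restatement follows, assuming 64-byte lines and using the builtin that the DPDK prefetch helpers wrap:

#include <stdint.h>

#define CLINE 64
#define OBJS_PER_CLINE (CLINE / sizeof(void *)) /* 8 pointers per line */

/* Warm a pointer array one cache line at a time, as the patched
 * cn10k_process_vwqe() does before handing the vector to Rx. */
static inline void
prefetch_ptr_array(void *const *ptrs, uint16_t nb_elem)
{
    uint16_t i;

    __builtin_prefetch(&ptrs[0], 0, 3); /* temporal, all cache levels */
    for (i = OBJS_PER_CLINE; i < nb_elem; i += OBJS_PER_CLINE)
        __builtin_prefetch(&ptrs[i], 0, 3);
}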
* Re: [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
  2022-02-24 18:40 ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align prefetches to CN10K cache model jerinj
@ 2022-02-24 20:39 ` Jerin Jacob
  1 sibling, 0 replies; 7+ messages in thread

From: Jerin Jacob @ 2022-02-24 20:39 UTC
To: Jerin Jacob
Cc: dpdk-dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
    Satha Rao, Pavan Nikhilesh

On Fri, Feb 25, 2022 at 12:09 AM <jerinj@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> In vWQE mode, the mbuf address is calculated without using the
> iova list.
>
> Packet length can also be calculated from NIX_PARSE_S, which lets us
> completely avoid reading the second cache line, depending on the
> offloads enabled.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
> v3:
> - Change NIX_MBUF_VALIDATE_NEXT macro to an inline function
> - Fix the checkpatch warning reported at
>   http://mails.dpdk.org/archives/test-report/2022-February/264235.html

Series applied to dpdk-next-net-mrvl/for-next-net. Thanks.
Thread overview: 7+ messages (links below jump to the message on this page)
2022-02-24 13:52 [PATCH 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
2022-02-24 13:52 ` [PATCH 2/2] event/cnxk: align prefetches to CN10K cache model pbhagavatula
2022-02-24 16:10 ` [PATCH v2 1/2] net/cnxk: optimize Rx pktsize extraction pbhagavatula
2022-02-24 16:10   ` [PATCH v2 2/2] net/cnxk: align prefetches to CN10K cache model pbhagavatula
2022-02-24 18:40   ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction jerinj
2022-02-24 18:40     ` [dpdk-dev] [PATCH v3 2/2] net/cnxk: align prefetches to CN10K cache model jerinj
2022-02-24 20:39     ` [dpdk-dev] [PATCH v3 1/2] net/cnxk: optimize Rx packet size extraction Jerin Jacob