DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v1 1/1] net/octeontx2: allow vec to process pkts not multiple of 4
@ 2019-12-20 13:02 vattunuru
  2020-01-14  4:06 ` Jerin Jacob
  0 siblings, 1 reply; 2+ messages in thread
From: vattunuru @ 2019-12-20 13:02 UTC (permalink / raw)
  To: dev; +Cc: jerinj, ndabilpuram, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current vector mode implementation floor-aligns the pkt count
with NIX_DESCS_PER_LOOP and processes that many packets.

This patch addresses the case where the pkt count modulo
NIX_DESCS_PER_LOOP is non-zero: after the vector mode processing,
the scalar routine is used to process any leftover packets. The
scalar routine is also used when the descriptor head is about to
wrap and would become unaligned.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
 drivers/net/octeontx2/otx2_rx.c | 18 ++++++++++++++----
 drivers/net/octeontx2/otx2_tx.c | 18 +++++++++++++-----
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/net/octeontx2/otx2_rx.c b/drivers/net/octeontx2/otx2_rx.c
index 48565db..8e6452a 100644
--- a/drivers/net/octeontx2/otx2_rx.c
+++ b/drivers/net/octeontx2/otx2_rx.c
@@ -130,16 +130,22 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 	const uintptr_t desc = rxq->desc;
 	uint8x16_t f0, f1, f2, f3;
 	uint32_t head = rxq->head;
+	uint16_t pkts_left;
 
 	pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
+	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
+
 	/* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
 	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
 
 	while (packets < pkts) {
-		/* Get the CQ pointers, since the ring size is multiple of
-		 * 4, We can avoid checking the wrap around of head
-		 * value after the each access unlike scalar version.
-		 */
+		/* Exit loop if head is about to wrap and become unaligned */
+		if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
+				NIX_DESCS_PER_LOOP) {
+			pkts_left += (pkts - packets);
+			break;
+		}
+
 		const uintptr_t cq0 = desc + CQE_SZ(head);
 
 		/* Prefetch N desc ahead */
@@ -301,6 +307,10 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 	/* Free all the CQs that we've processed */
 	otx2_write64((rxq->wdata | packets), rxq->cq_door);
 
+	if (unlikely(pkts_left))
+		packets += nix_recv_pkts(rx_queue, &rx_pkts[packets],
+					 pkts_left, flags);
+
 	return packets;
 }
 
diff --git a/drivers/net/octeontx2/otx2_tx.c b/drivers/net/octeontx2/otx2_tx.c
index fa53300..96be92a 100644
--- a/drivers/net/octeontx2/otx2_tx.c
+++ b/drivers/net/octeontx2/otx2_tx.c
@@ -97,7 +97,7 @@ nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 #define NIX_DESCS_PER_LOOP	4
 static __rte_always_inline uint16_t
 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-		     uint16_t pkts, const uint16_t flags)
+		     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -118,11 +118,13 @@ nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t cmd20, cmd21;
 	uint64x2_t cmd30, cmd31;
 	uint64_t lmt_status, i;
-
-	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	uint16_t pkts_left;
 
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
+	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
+	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
@@ -929,17 +931,21 @@ nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		} while (lmt_status == 0);
 	}
 
+	if (unlikely(pkts_left))
+		pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);
+
 	return pkts;
 }
 
 #else
 static __rte_always_inline uint16_t
 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-		     uint16_t pkts, const uint16_t flags)
+		     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
 {
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
+	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
 	return 0;
 }
@@ -985,12 +991,14 @@ static uint16_t __rte_noinline	__hot					\
 otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,				\
 			struct rte_mbuf **tx_pkts, uint16_t pkts)	\
 {									\
+	uint64_t cmd[sz];						\
+									\
 	/* VLAN, TSTMP, TSO is not supported by vec */			\
 	if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||			\
 	    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||			\
 	    (flags) & NIX_TX_OFFLOAD_TSO_F)				\
 		return 0;						\
-	return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, (flags));	\
+	return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
 }
 
 NIX_TX_FASTPATH_MODES
-- 
2.8.4


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/1] net/octeontx2: allow vec to process pkts not multiple of 4
  2019-12-20 13:02 [dpdk-dev] [PATCH v1 1/1] net/octeontx2: allow vec to process pkts not multiple of 4 vattunuru
@ 2020-01-14  4:06 ` Jerin Jacob
  0 siblings, 0 replies; 2+ messages in thread
From: Jerin Jacob @ 2020-01-14  4:06 UTC (permalink / raw)
  To: Vamsi Attunuru, Ferruh Yigit; +Cc: dpdk-dev, Jerin Jacob, Nithin Dabilpuram

On Fri, Dec 20, 2019 at 6:33 PM <vattunuru@marvell.com> wrote:
>
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> The current vector mode implementation floor-aligns the pkt count
> with NIX_DESCS_PER_LOOP and processes that many packets.
>
> This patch addresses the case where the pkt count modulo
> NIX_DESCS_PER_LOOP is non-zero: after the vector mode processing,
> the scalar routine is used to process any leftover packets. The
> scalar routine is also used when the descriptor head is about to
> wrap and would become unaligned.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>


Applied to dpdk-next-net-mrvl/master. Thanks


> ---
>  drivers/net/octeontx2/otx2_rx.c | 18 ++++++++++++++----
>  drivers/net/octeontx2/otx2_tx.c | 18 +++++++++++++-----
>  2 files changed, 27 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/octeontx2/otx2_rx.c b/drivers/net/octeontx2/otx2_rx.c
> index 48565db..8e6452a 100644
> --- a/drivers/net/octeontx2/otx2_rx.c
> +++ b/drivers/net/octeontx2/otx2_rx.c
> @@ -130,16 +130,22 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
>         const uintptr_t desc = rxq->desc;
>         uint8x16_t f0, f1, f2, f3;
>         uint32_t head = rxq->head;
> +       uint16_t pkts_left;
>
>         pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
> +       pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
> +
>         /* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
>         pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
>
>         while (packets < pkts) {
> -               /* Get the CQ pointers, since the ring size is multiple of
> -                * 4, We can avoid checking the wrap around of head
> -                * value after the each access unlike scalar version.
> -                */
> +               /* Exit loop if head is about to wrap and become unaligned */
> +               if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
> +                               NIX_DESCS_PER_LOOP) {
> +                       pkts_left += (pkts - packets);
> +                       break;
> +               }
> +
>                 const uintptr_t cq0 = desc + CQE_SZ(head);
>
>                 /* Prefetch N desc ahead */
> @@ -301,6 +307,10 @@ nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
>         /* Free all the CQs that we've processed */
>         otx2_write64((rxq->wdata | packets), rxq->cq_door);
>
> +       if (unlikely(pkts_left))
> +               packets += nix_recv_pkts(rx_queue, &rx_pkts[packets],
> +                                        pkts_left, flags);
> +
>         return packets;
>  }
>
> diff --git a/drivers/net/octeontx2/otx2_tx.c b/drivers/net/octeontx2/otx2_tx.c
> index fa53300..96be92a 100644
> --- a/drivers/net/octeontx2/otx2_tx.c
> +++ b/drivers/net/octeontx2/otx2_tx.c
> @@ -97,7 +97,7 @@ nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
>  #define NIX_DESCS_PER_LOOP     4
>  static __rte_always_inline uint16_t
>  nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                    uint16_t pkts, const uint16_t flags)
> +                    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
>  {
>         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
>         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
> @@ -118,11 +118,13 @@ nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>         uint64x2_t cmd20, cmd21;
>         uint64x2_t cmd30, cmd31;
>         uint64_t lmt_status, i;
> -
> -       pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> +       uint16_t pkts_left;
>
>         NIX_XMIT_FC_OR_RETURN(txq, pkts);
>
> +       pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
> +       pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> +
>         /* Reduce the cached count */
>         txq->fc_cache_pkts -= pkts;
>
> @@ -929,17 +931,21 @@ nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
>                 } while (lmt_status == 0);
>         }
>
> +       if (unlikely(pkts_left))
> +               pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);
> +
>         return pkts;
>  }
>
>  #else
>  static __rte_always_inline uint16_t
>  nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> -                    uint16_t pkts, const uint16_t flags)
> +                    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
>  {
>         RTE_SET_USED(tx_queue);
>         RTE_SET_USED(tx_pkts);
>         RTE_SET_USED(pkts);
> +       RTE_SET_USED(cmd);
>         RTE_SET_USED(flags);
>         return 0;
>  }
> @@ -985,12 +991,14 @@ static uint16_t __rte_noinline    __hot                                   \
>  otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                                \
>                         struct rte_mbuf **tx_pkts, uint16_t pkts)       \
>  {                                                                      \
> +       uint64_t cmd[sz];                                               \
> +                                                                       \
>         /* VLAN, TSTMP, TSO is not supported by vec */                  \
>         if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
>             (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
>             (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
>                 return 0;                                               \
> -       return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, (flags));  \
> +       return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
>  }
>
>  NIX_TX_FASTPATH_MODES
> --
> 2.8.4
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-01-14  4:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20 13:02 [dpdk-dev] [PATCH v1 1/1] net/octeontx2: allow vec to process pkts not multiple of 4 vattunuru
2020-01-14  4:06 ` Jerin Jacob

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).