This series is to optimize the i40e vPMD performance on aarch64. The patches were benchmarked by running the following command on Marvell ThunderX2 and Arm N1SDP and showed positive performance results. sudo ./build/app/testpmd -l 1,3 -w 0001:01:00.0 -w 0001:01:00.1 --master-lcore 1 -- -i --rxq=4 --txq=4 --nb-cores=1 --nb-ports=2 -a Gavin Hu (3): net/i40e: relax barrier in the Tx fastpath of vPMD net/i40e: restrict pointer aliasing for neon vec net/i40e: auto-vectorization to speed up Tx free drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ drivers/net/i40e/i40e_rxtx_vec_neon.c | 24 +++++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) -- 2.17.1
To keep ordering of mixed accesses, rte_cio is sufficient. The rte_io barrier inside the I40E_PCI_REG_WRITE is overkill.[1] This patch fixes by replacing with just sufficient barriers in the normal PMD and vPMD. It showed 7% performance uplift on ThunderX2 and 4% on Arm N1SDP. The test case is the RFC2544 zero-loss test running testpmd. [1] http://inbox.dpdk.org/dev/CALBAE1M-ezVWCjqCZDBw+MMDEC4O9 qf0Kpn89EMdGDajepKoZQ@mail.gmail.com Fixes: 4861cde46116 ("i40e: new poll mode driver") Cc: stable@dpdk.org Signed-off-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index deb185fe2..4376d8911 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -72,8 +72,9 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + rte_cio_wmb(); /* Update the tail pointer on the NIC */ - I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } static inline void @@ -564,7 +565,8 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, txq->tx_tail = tx_id; - I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); + rte_cio_wmb(); + I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id); return nb_pkts; } -- 2.17.1
restrict pointer aliasing to optimize the code generated. The patch showed ~3% performnace uplift on Arm N1SDP platform, and no degradation on ThunderX2. The tet case is RFC2544 zero-loss L2 forwarding running testpmd. [1] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Restricted-Pointers.html Signed-off-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Steve Capper <steve.capper@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index 4376d8911..405bd82df 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -172,8 +172,8 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4], #define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t)) static inline void -desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, - uint32_t *ptype_tbl) +desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **restrict rx_pkts, + uint32_t *restrict ptype_tbl) { int i; uint8_t ptype; @@ -194,8 +194,8 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, * numbers of DD bits */ static inline uint16_t -_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, - uint16_t nb_pkts, uint8_t *split_packet) +_recv_raw_pkts_vec(struct i40e_rx_queue *restrict rxq, struct rte_mbuf + **restrict rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; @@ -432,7 +432,7 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * numbers of DD bits */ uint16_t -i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, +i40e_recv_pkts_vec(void *restrict rx_queue, struct rte_mbuf **restrict rx_pkts, uint16_t nb_pkts) { return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); @@ -494,8 +494,8 @@ vtx1(volatile struct i40e_tx_desc *txdp, } static inline void -vtx(volatile struct i40e_tx_desc *txdp, - struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) +vtx(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkt, + uint16_t nb_pkts, uint64_t flags) { int i; @@ -504,8 +504,8 @@ vtx(volatile struct i40e_tx_desc *txdp, } uint16_t -i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) +i40e_xmit_fixed_burst_vec(void *restrict tx_queue, + struct rte_mbuf **restrict tx_pkts, uint16_t nb_pkts) { struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue; volatile struct i40e_tx_desc *txdp; -- 2.17.1
Tx mbuf free is a hotspot for i40e on aarch64, as there are no inter-loop dependencies, it is safe to enable auto-vectorization to speed up. This patch showed 2~3% performance lift on ThunderX2 and no degradation on Arm N1SDP. The test case is single core RFC2544 zero-loss test. Signed-off-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Steve Capper <steve.capper@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h index 0e6ffa007..fc0fa45d4 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_common.h +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) if (likely(m != NULL)) { free[0] = m; nb_free = 1; +#if defined(__clang__) +#pragma clang loop vectorize(assume_safety) +#elif defined(__GNUC__) +#pragma GCC ivdep +#endif for (i = 1; i < n; i++) { m = rte_pktmbuf_prefree_seg(txep[i].mbuf); if (likely(m != NULL)) { -- 2.17.1
On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote: > > Tx mbuf free is a hotspot for i40e on aarch64, as there are no > inter-loop dependencies, it is safe to enable auto-vectorization > to speed up. > > This patch showed 2~3% performance lift on ThunderX2 and no degradation > on Arm N1SDP. The test case is single core RFC2544 zero-loss test. > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> > Reviewed-by: Steve Capper <steve.capper@arm.com> > --- > drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ > 1 file changed, 5 insertions(+) > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h b/drivers/net/i40e/i40e_rxtx_vec_common.h > index 0e6ffa007..fc0fa45d4 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h > @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > if (likely(m != NULL)) { > free[0] = m; > nb_free = 1; > +#if defined(__clang__) > +#pragma clang loop vectorize(assume_safety) > +#elif defined(__GNUC__) > +#pragma GCC ivdep > +#endif IMO, It is better to abstract the compiler features (above compiler feature and __restrict__) as macros in rte_common.h or so. It will help to support other compilers(ICC or Windows) and enable them to have "changes" in one place. > for (i = 1; i < n; i++) { > m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > if (likely(m != NULL)) { > -- > 2.17.1 >
06/03/2020 08:44, Jerin Jacob:
> On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote:
> > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h
> > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h
> > @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq)
> > if (likely(m != NULL)) {
> > free[0] = m;
> > nb_free = 1;
> > +#if defined(__clang__)
> > +#pragma clang loop vectorize(assume_safety)
> > +#elif defined(__GNUC__)
> > +#pragma GCC ivdep
> > +#endif
>
> IMO, It is better to abstract the compiler features (above compiler
> feature and __restrict__) as macros in
> rte_common.h or so. It will help to support other compilers(ICC or
> Windows) and enable them to have "changes" in one place.
I agree with the need for common abstraction.
On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote: > > To keep ordering of mixed accesses, rte_cio is sufficient. > The rte_io barrier inside the I40E_PCI_REG_WRITE is overkill.[1] > > This patch fixes by replacing with just sufficient barriers in the > normal PMD and vPMD. > > It showed 7% performance uplift on ThunderX2 and 4% on Arm N1SDP. > The test case is the RFC2544 zero-loss test running testpmd. > > [1] http://inbox.dpdk.org/dev/CALBAE1M-ezVWCjqCZDBw+MMDEC4O9 > qf0Kpn89EMdGDajepKoZQ@mail.gmail.com > > Fixes: 4861cde46116 ("i40e: new poll mode driver") > Cc: stable@dpdk.org > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> Acked-by: Jerin Jacob <jerinj@marvell.com> > --- > drivers/net/i40e/i40e_rxtx_vec_neon.c | 6 ++++-- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c > index deb185fe2..4376d8911 100644 > --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c > +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c > @@ -72,8 +72,9 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) > rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? > (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); > > + rte_cio_wmb(); > /* Update the tail pointer on the NIC */ > - I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); > + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); > } > > static inline void > @@ -564,7 +565,8 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, > > txq->tx_tail = tx_id; > > - I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); > + rte_cio_wmb(); > + I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id); > > return nb_pkts; > } > -- > 2.17.1 >
Hi Jerin, > -----Original Message----- > From: Jerin Jacob <jerinjacobk@gmail.com> > Sent: Friday, March 6, 2020 3:45 PM > To: Gavin Hu <Gavin.Hu@arm.com> > Cc: dpdk-dev <dev@dpdk.org>; nd <nd@arm.com>; David Marchand > <david.marchand@redhat.com>; thomas@monjalon.net; > jerinj@marvell.com; Ye, Xiaolong <xiaolong.ye@intel.com>; Honnappa > Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang > <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>; Joyce Kong > <Joyce.Kong@arm.com>; Steve Capper <Steve.Capper@arm.com> > Subject: Re: [dpdk-dev] [PATCH v1 3/3] net/i40e: auto-vectorization to > speed up Tx free > > On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote: > > > > Tx mbuf free is a hotspot for i40e on aarch64, as there are no > > inter-loop dependencies, it is safe to enable auto-vectorization > > to speed up. > > > > This patch showed 2~3% performance lift on ThunderX2 and no > degradation > > on Arm N1SDP. The test case is single core RFC2544 zero-loss test. > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> > > Reviewed-by: Steve Capper <steve.capper@arm.com> > > --- > > drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ > > 1 file changed, 5 insertions(+) > > > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h > b/drivers/net/i40e/i40e_rxtx_vec_common.h > > index 0e6ffa007..fc0fa45d4 100644 > > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h > > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h > > @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > > if (likely(m != NULL)) { > > free[0] = m; > > nb_free = 1; > > +#if defined(__clang__) > > +#pragma clang loop vectorize(assume_safety) > > +#elif defined(__GNUC__) > > +#pragma GCC ivdep > > +#endif > > IMO, It is better to abstract the compiler features (above compiler > feature and __restrict__) as macros in > rte_common.h or so. It will help to support other compilers(ICC or > Windows) and enable them to have "changes" in one place. How about defining RTE_LOOP_AUTO_VECTORIZATION in the rte_common.h? #if defined(__clang__) define RTE_LOOP_AUTO_VECTORIZATION \ #pragma clang loop vectorize(assume_safety) #elif defined(__GNUC__) define RTE_LOOP_AUTO_VECTORIZATION \ #pragma GCC ivdep #else define RTE_LOOP_AUTO_VECTORIZATION #endif If you agree, I will submit a v2. Thanks for your comments! /Gavin > > > > > for (i = 1; i < n; i++) { > > m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > > if (likely(m != NULL)) { > > -- > > 2.17.1 > >
On Sat, Mar 7, 2020 at 8:34 PM Gavin Hu <Gavin.Hu@arm.com> wrote: > > Hi Jerin, > > > -----Original Message----- > > From: Jerin Jacob <jerinjacobk@gmail.com> > > Sent: Friday, March 6, 2020 3:45 PM > > To: Gavin Hu <Gavin.Hu@arm.com> > > Cc: dpdk-dev <dev@dpdk.org>; nd <nd@arm.com>; David Marchand > > <david.marchand@redhat.com>; thomas@monjalon.net; > > jerinj@marvell.com; Ye, Xiaolong <xiaolong.ye@intel.com>; Honnappa > > Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang > > <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>; Joyce Kong > > <Joyce.Kong@arm.com>; Steve Capper <Steve.Capper@arm.com> > > Subject: Re: [dpdk-dev] [PATCH v1 3/3] net/i40e: auto-vectorization to > > speed up Tx free > > > > On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote: > > > > > > Tx mbuf free is a hotspot for i40e on aarch64, as there are no > > > inter-loop dependencies, it is safe to enable auto-vectorization > > > to speed up. > > > > > > This patch showed 2~3% performance lift on ThunderX2 and no > > degradation > > > on Arm N1SDP. The test case is single core RFC2544 zero-loss test. > > > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> > > > Reviewed-by: Steve Capper <steve.capper@arm.com> > > > --- > > > drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ > > > 1 file changed, 5 insertions(+) > > > > > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h > > b/drivers/net/i40e/i40e_rxtx_vec_common.h > > > index 0e6ffa007..fc0fa45d4 100644 > > > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h > > > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h > > > @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > > > if (likely(m != NULL)) { > > > free[0] = m; > > > nb_free = 1; > > > +#if defined(__clang__) > > > +#pragma clang loop vectorize(assume_safety) > > > +#elif defined(__GNUC__) > > > +#pragma GCC ivdep > > > +#endif > > > > IMO, It is better to abstract the compiler features (above compiler > > feature and __restrict__) as macros in > > rte_common.h or so. It will help to support other compilers(ICC or > > Windows) and enable them to have "changes" in one place. > > How about defining RTE_LOOP_AUTO_VECTORIZATION in the rte_common.h? Other compiler stuff in rte_common.h are starting with __rte in small letter(__rte_packed, __rte_unused) etc. I think, a better name would be __rte_loop_auto_vectorize or so. No strong opinion for the name though. # Probably it is worth checking and add performance result of x86 testing in git commit as well as it is common code. > #if defined(__clang__) > define RTE_LOOP_AUTO_VECTORIZATION \ > #pragma clang loop vectorize(assume_safety) > #elif defined(__GNUC__) > define RTE_LOOP_AUTO_VECTORIZATION \ > #pragma GCC ivdep > #else > define RTE_LOOP_AUTO_VECTORIZATION > #endif > If you agree, I will submit a v2. Thanks for your comments! > /Gavin > > > > > > > > > for (i = 1; i < n; i++) { > > > m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > > > if (likely(m != NULL)) { > > > -- > > > 2.17.1 > > >
Hi Jerin, > -----Original Message----- > From: Jerin Jacob <jerinjacobk@gmail.com> > Sent: Monday, March 9, 2020 3:36 PM > To: Gavin Hu <Gavin.Hu@arm.com> > Cc: dpdk-dev <dev@dpdk.org>; nd <nd@arm.com>; David Marchand > <david.marchand@redhat.com>; thomas@monjalon.net; jerinj@marvell.com; > Ye, Xiaolong <xiaolong.ye@intel.com>; Honnappa Nagarahalli > <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang > <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>; Joyce Kong > <Joyce.Kong@arm.com>; Steve Capper <Steve.Capper@arm.com> > Subject: Re: [dpdk-dev] [PATCH v1 3/3] net/i40e: auto-vectorization to speed > up Tx free > > On Sat, Mar 7, 2020 at 8:34 PM Gavin Hu <Gavin.Hu@arm.com> wrote: > > > > Hi Jerin, > > > > > -----Original Message----- > > > From: Jerin Jacob <jerinjacobk@gmail.com> > > > Sent: Friday, March 6, 2020 3:45 PM > > > To: Gavin Hu <Gavin.Hu@arm.com> > > > Cc: dpdk-dev <dev@dpdk.org>; nd <nd@arm.com>; David Marchand > > > <david.marchand@redhat.com>; thomas@monjalon.net; > > > jerinj@marvell.com; Ye, Xiaolong <xiaolong.ye@intel.com>; Honnappa > > > Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang > > > <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>; Joyce Kong > > > <Joyce.Kong@arm.com>; Steve Capper <Steve.Capper@arm.com> > > > Subject: Re: [dpdk-dev] [PATCH v1 3/3] net/i40e: auto-vectorization to > > > speed up Tx free > > > > > > On Fri, Mar 6, 2020 at 10:35 AM Gavin Hu <gavin.hu@arm.com> wrote: > > > > > > > > Tx mbuf free is a hotspot for i40e on aarch64, as there are no > > > > inter-loop dependencies, it is safe to enable auto-vectorization > > > > to speed up. > > > > > > > > This patch showed 2~3% performance lift on ThunderX2 and no > > > degradation > > > > on Arm N1SDP. The test case is single core RFC2544 zero-loss test. > > > > > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> > > > > Reviewed-by: Steve Capper <steve.capper@arm.com> > > > > --- > > > > drivers/net/i40e/i40e_rxtx_vec_common.h | 5 +++++ > > > > 1 file changed, 5 insertions(+) > > > > > > > > diff --git a/drivers/net/i40e/i40e_rxtx_vec_common.h > > > b/drivers/net/i40e/i40e_rxtx_vec_common.h > > > > index 0e6ffa007..fc0fa45d4 100644 > > > > --- a/drivers/net/i40e/i40e_rxtx_vec_common.h > > > > +++ b/drivers/net/i40e/i40e_rxtx_vec_common.h > > > > @@ -98,6 +98,11 @@ i40e_tx_free_bufs(struct i40e_tx_queue *txq) > > > > if (likely(m != NULL)) { > > > > free[0] = m; > > > > nb_free = 1; > > > > +#if defined(__clang__) > > > > +#pragma clang loop vectorize(assume_safety) > > > > +#elif defined(__GNUC__) > > > > +#pragma GCC ivdep > > > > +#endif > > > > > > IMO, It is better to abstract the compiler features (above compiler > > > feature and __restrict__) as macros in > > > rte_common.h or so. It will help to support other compilers(ICC or > > > Windows) and enable them to have "changes" in one place. > > > > How about defining RTE_LOOP_AUTO_VECTORIZATION in the > rte_common.h? > > Other compiler stuff in rte_common.h are starting with __rte in small > letter(__rte_packed, __rte_unused) etc. > I think, a better name would be __rte_loop_auto_vectorize or so. > No strong opinion for the name though. > > # Probably it is worth checking and add performance result of x86 > testing in git commit as well as it > is common code. Okay, I will do it. > > > > #if defined(__clang__) > > define RTE_LOOP_AUTO_VECTORIZATION \ > > #pragma clang loop vectorize(assume_safety) > > #elif defined(__GNUC__) > > define RTE_LOOP_AUTO_VECTORIZATION \ > > #pragma GCC ivdep > > #else > > define RTE_LOOP_AUTO_VECTORIZATION > > #endif > > If you agree, I will submit a v2. Thanks for your comments! > > /Gavin > > > > > > > > > > > > > for (i = 1; i < n; i++) { > > > > m = rte_pktmbuf_prefree_seg(txep[i].mbuf); > > > > if (likely(m != NULL)) { > > > > -- > > > > 2.17.1 > > > >
V2: - drop the 3/3 auto-vectorization patch from this series, it requires more rework and will submit in a separate patch. This series is to optimize the i40e NEON vPMD performance on aarch64. The patches were benchmarked by running the following command on Marvell ThunderX2 and Arm N1SDP and showed positive performance results. sudo ./build/app/testpmd -l 1,3 -w 0001:01:00.0 -w 0001:01:00.1 --master-lcore 1 -- -i --rxq=4 --txq=4 --nb-cores=1 --nb-ports=2 -a Gavin Hu (2): net/i40e: relax barrier in Tx fastpath for NEON vPMD net/i40e: restrict pointer aliasing for NEON vPMD drivers/net/i40e/i40e_rxtx_vec_neon.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) -- 2.17.1
To keep ordering of mixed accesses, 'DMB OSH' is sufficient. 'DSB' inside the I40E_PCI_REG_WRITE is overkill.[1] This patch fixes by replacing with just sufficient barriers in the normal PMD and vPMD. It showed 7% performance uplift on ThunderX2 and 4% on Arm N1SDP. The test case is the RFC2544 zero-loss test running testpmd. [1] http://inbox.dpdk.org/dev/CALBAE1M-ezVWCjqCZDBw+MMDEC4O9 qf0Kpn89EMdGDajepKoZQ@mail.gmail.com Fixes: 4861cde46116 ("i40e: new poll mode driver") Cc: stable@dpdk.org Signed-off-by: Gavin Hu <gavin.hu@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index deb185fe2..4376d8911 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -72,8 +72,9 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + rte_cio_wmb(); /* Update the tail pointer on the NIC */ - I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } static inline void @@ -564,7 +565,8 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, txq->tx_tail = tx_id; - I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); + rte_cio_wmb(); + I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id); return nb_pkts; } -- 2.17.1
restrict pointer aliasing to optimize the code generated. The patch showed ~3% performnace uplift on Arm N1SDP platform, and no degradation on ThunderX2. The tet case is RFC2544 zero-loss L2 forwarding running testpmd. [1] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Restricted-Pointers.html Signed-off-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Steve Capper <steve.capper@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index 4376d8911..405bd82df 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -172,8 +172,8 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4], #define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t)) static inline void -desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, - uint32_t *ptype_tbl) +desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **restrict rx_pkts, + uint32_t *restrict ptype_tbl) { int i; uint8_t ptype; @@ -194,8 +194,8 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, * numbers of DD bits */ static inline uint16_t -_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, - uint16_t nb_pkts, uint8_t *split_packet) +_recv_raw_pkts_vec(struct i40e_rx_queue *restrict rxq, struct rte_mbuf + **restrict rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; @@ -432,7 +432,7 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * numbers of DD bits */ uint16_t -i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, +i40e_recv_pkts_vec(void *restrict rx_queue, struct rte_mbuf **restrict rx_pkts, uint16_t nb_pkts) { return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); @@ -494,8 +494,8 @@ vtx1(volatile struct i40e_tx_desc *txdp, } static inline void -vtx(volatile struct i40e_tx_desc *txdp, - struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) +vtx(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkt, + uint16_t nb_pkts, uint64_t flags) { int i; @@ -504,8 +504,8 @@ vtx(volatile struct i40e_tx_desc *txdp, } uint16_t -i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) +i40e_xmit_fixed_burst_vec(void *restrict tx_queue, + struct rte_mbuf **restrict tx_pkts, uint16_t nb_pkts) { struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue; volatile struct i40e_tx_desc *txdp; -- 2.17.1
V3: - fix the 'Fixes' line warning. V2: - drop the 3/3 auto-vectorization patch from this series, it requires more rework and will submit in a separate patch. This series is to optimize the i40e NEON vPMD performance on aarch64. The patches were benchmarked by running the following command on Marvell ThunderX2 and Arm N1SDP and showed positive performance results. sudo ./build/app/testpmd -l 1,3 -w 0001:01:00.0 -w 0001:01:00.1 --master-lcore 1 -- -i --rxq=4 --txq=4 --nb-cores=1 --nb-ports=2 -a Gavin Hu (2): net/i40e: relax barrier in Tx fastpath for NEON vPMD net/i40e: restrict pointer aliasing for NEON vPMD drivers/net/i40e/i40e_rxtx_vec_neon.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) -- 2.17.1
To keep ordering of mixed accesses, 'DMB OSH' is sufficient. 'DSB' inside the I40E_PCI_REG_WRITE is overkill.[1] This patch fixes by replacing with just sufficient barriers in the normal PMD and vPMD. It showed 7% performance uplift on ThunderX2 and 4% on Arm N1SDP. The test case is the RFC2544 zero-loss test running testpmd. [1] http://inbox.dpdk.org/dev/CALBAE1M-ezVWCjqCZDBw+MMDEC4O9 qf0Kpn89EMdGDajepKoZQ@mail.gmail.com Fixes: ae0eb310f253 ("net/i40e: implement vector PMD for ARM") Cc: stable@dpdk.org Signed-off-by: Gavin Hu <gavin.hu@arm.com> Acked-by: Jerin Jacob <jerinj@marvell.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index deb185fe2..4376d8911 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -72,8 +72,9 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq) rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); + rte_cio_wmb(); /* Update the tail pointer on the NIC */ - I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id); } static inline void @@ -564,7 +565,8 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, txq->tx_tail = tx_id; - I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail); + rte_cio_wmb(); + I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id); return nb_pkts; } -- 2.17.1
restrict pointer aliasing to optimize the code generated. The patch showed ~3% performnace uplift on Arm N1SDP platform, and no degradation on ThunderX2. The tet case is RFC2544 zero-loss L2 forwarding running testpmd. [1] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Restricted-Pointers.html Signed-off-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Steve Capper <steve.capper@arm.com> --- drivers/net/i40e/i40e_rxtx_vec_neon.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c index 4376d8911..405bd82df 100644 --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c @@ -172,8 +172,8 @@ desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4], #define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t)) static inline void -desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, - uint32_t *ptype_tbl) +desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **restrict rx_pkts, + uint32_t *restrict ptype_tbl) { int i; uint8_t ptype; @@ -194,8 +194,8 @@ desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **rx_pkts, * numbers of DD bits */ static inline uint16_t -_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, - uint16_t nb_pkts, uint8_t *split_packet) +_recv_raw_pkts_vec(struct i40e_rx_queue *restrict rxq, struct rte_mbuf + **restrict rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; @@ -432,7 +432,7 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, * numbers of DD bits */ uint16_t -i40e_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, +i40e_recv_pkts_vec(void *restrict rx_queue, struct rte_mbuf **restrict rx_pkts, uint16_t nb_pkts) { return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL); @@ -494,8 +494,8 @@ vtx1(volatile struct i40e_tx_desc *txdp, } static inline void -vtx(volatile struct i40e_tx_desc *txdp, - struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) +vtx(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkt, + uint16_t nb_pkts, uint64_t flags) { int i; @@ -504,8 +504,8 @@ vtx(volatile struct i40e_tx_desc *txdp, } uint16_t -i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) +i40e_xmit_fixed_burst_vec(void *restrict tx_queue, + struct rte_mbuf **restrict tx_pkts, uint16_t nb_pkts) { struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue; volatile struct i40e_tx_desc *txdp; -- 2.17.1
On 04/14, Gavin Hu wrote:
>V3:
>- fix the 'Fixes' line warning.
>V2:
>- drop the 3/3 auto-vectorization patch from this series, it requires
> more rework and will submit in a separate patch.
>
>This series is to optimize the i40e NEON vPMD performance on aarch64.
>
>The patches were benchmarked by running the following command on Marvell
>ThunderX2 and Arm N1SDP and showed positive performance results.
>
>sudo ./build/app/testpmd -l 1,3 -w 0001:01:00.0 -w 0001:01:00.1
>--master-lcore 1 -- -i --rxq=4 --txq=4 --nb-cores=1 --nb-ports=2 -a
>
>Gavin Hu (2):
> net/i40e: relax barrier in Tx fastpath for NEON vPMD
> net/i40e: restrict pointer aliasing for NEON vPMD
>
> drivers/net/i40e/i40e_rxtx_vec_neon.c | 24 +++++++++++++-----------
> 1 file changed, 13 insertions(+), 11 deletions(-)
>
>--
>2.17.1
>
Applied to dpdk-next-net-intel, Thanks.
On 4/13/2020 4:56 PM, Gavin Hu wrote:
> restrict pointer aliasing to optimize the code generated.
>
> The patch showed ~3% performnace uplift on Arm N1SDP platform, and no
> degradation on ThunderX2. The tet case is RFC2544 zero-loss L2
> forwarding running testpmd.
>
> [1] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Restricted-Pointers.html
>
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
Hi Gavin,
Raslan supported following build error [1], this is blocking the next-net, can
you please check the error, and send a fix for it?
[1]
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:175:65: error: expected
';', ',' or ')' before 'rx_pkts'
desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **restrict rx_pkts,
^
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:197:51: error: expected
';', ',' or ')' before 'rxq'
_recv_raw_pkts_vec(struct i40e_rx_queue *restrict rxq, struct rte_mbuf
^
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:435:35: error: expected
';', ',' or ')' before 'rx_queue'
i40e_recv_pkts_vec(void *restrict rx_queue, struct rte_mbuf **restrict rx_pkts,
^
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c: In function
'i40e_recv_scattered_pkts_vec':
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:456:2: error: implicit
declaration of function '_recv_raw_pkts_vec' [-Werror=implicit-function-declaration]
uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
^
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:456:2: error: nested extern
declaration of '_recv_raw_pkts_vec' [-Werror=nested-externs]
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c: At top level:
/download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:507:42: error: expected
';', ',' or ')' before 'tx_queue'
i40e_xmit_fixed_burst_vec(void *restrict tx_queue,
gcc version 4.8.5 20150623 (Red Hat 4.8.5-28) (GCC)
seen this with T=arm64-bluefield-linuxapp-gcc and T=arm64-armv8a-linux-gcc
using make
Hi Ferruh, > -----Original Message----- > From: Ferruh Yigit <ferruh.yigit@intel.com> > Sent: Monday, April 20, 2020 10:51 PM > To: Gavin Hu <Gavin.Hu@arm.com>; dev@dpdk.org > Cc: nd <nd@arm.com>; david.marchand@redhat.com; > thomas@monjalon.net; jerinj@marvell.com; xiaolong.ye@intel.com; > Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang > <Ruifeng.Wang@arm.com>; Phil Yang <Phil.Yang@arm.com>; Joyce Kong > <Joyce.Kong@arm.com>; Steve Capper <Steve.Capper@arm.com>; Raslan > Darawsheh <rasland@mellanox.com> > Subject: Re: [dpdk-dev] [PATCH v2 2/2] net/i40e: restrict pointer aliasing for > NEON vPMD > > On 4/13/2020 4:56 PM, Gavin Hu wrote: > > restrict pointer aliasing to optimize the code generated. > > > > The patch showed ~3% performnace uplift on Arm N1SDP platform, and > no > > degradation on ThunderX2. The tet case is RFC2544 zero-loss L2 > > forwarding running testpmd. > > > > [1] https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Restricted-Pointers.html > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com> > > Reviewed-by: Steve Capper <steve.capper@arm.com> > > Hi Gavin, > > Raslan supported following build error [1], this is blocking the next-net, can > you please check the error, and send a fix for it? > > [1] > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:175:65: error: > expected > ';', ',' or ')' before 'rx_pkts' > desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **restrict rx_pkts, > ^ > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:197:51: error: > expected > ';', ',' or ')' before 'rxq' > _recv_raw_pkts_vec(struct i40e_rx_queue *restrict rxq, struct rte_mbuf > ^ > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:435:35: error: > expected > ';', ',' or ')' before 'rx_queue' > i40e_recv_pkts_vec(void *restrict rx_queue, struct rte_mbuf **restrict > rx_pkts, > ^ > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c: In function > 'i40e_recv_scattered_pkts_vec': > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:456:2: error: > implicit > declaration of function '_recv_raw_pkts_vec' [-Werror=implicit-function- > declaration] > uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts, > ^ > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:456:2: error: > nested extern > declaration of '_recv_raw_pkts_vec' [-Werror=nested-externs] > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c: At top level: > /download/dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c:507:42: error: > expected > ';', ',' or ')' before 'tx_queue' > i40e_xmit_fixed_burst_vec(void *restrict tx_queue, > > > gcc version 4.8.5 20150623 (Red Hat 4.8.5-28) (GCC) > seen this with T=arm64-bluefield-linuxapp-gcc and T=arm64-armv8a-linux- > gcc > using make Sorry for the leakage, I submitted a patch to fix this: http://patches.dpdk.org/patch/69006/ It was verified with gcc 4.8.5 and gcc-8, as well as meson + clang.