DPDK patches and discussions
 help / color / Atom feed
* [dpdk-dev] [PATCH v1] net/memif:  optimized with one-way barrier
@ 2019-08-26 11:00 Phil Yang
  2019-08-26 11:03 ` Phil Yang (Arm Technology China)
  2019-10-09  2:04 ` [dpdk-dev] [PATCH v2] " Phil Yang
  0 siblings, 2 replies; 9+ messages in thread
From: Phil Yang @ 2019-08-26 11:00 UTC (permalink / raw)
  To: jgrajcia, dev; +Cc: thomas, jerinj, Honnappa.Nagarahalli, damarion, nd

Using 'rte_mb' to synchronize the shared ring head/tail between producer
and consumer will stall the pipeline and damage performance on the weak
memory model platforms, such like aarch64. Meanwhile update the shared
ring head and tail are observable and ordered between CPUs on IA.

Optimized this full barrier with the one-way barrier can improve the
throughput. On aarch64 n1sdp server this patch make testpmd throughput
boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
 drivers/net/memif/memif.h         |  4 +--
 drivers/net/memif/rte_eth_memif.c | 53 +++++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/drivers/net/memif/memif.h b/drivers/net/memif/memif.h
index 3948b1f..a4d88c0 100644
--- a/drivers/net/memif/memif.h
+++ b/drivers/net/memif/memif.h
@@ -169,9 +169,9 @@ typedef struct {
 	uint32_t cookie;			/**< MEMIF_COOKIE */
 	uint16_t flags;				/**< flags */
 #define MEMIF_RING_FLAG_MASK_INT 1		/**< disable interrupt mode */
-	volatile uint16_t head;			/**< pointer to ring buffer head */
+	uint16_t head;			/**< pointer to ring buffer head */
 	MEMIF_CACHELINE_ALIGN_MARK(cacheline1);
-	volatile uint16_t tail;			/**< pointer to ring buffer tail */
+	uint16_t tail;			/**< pointer to ring buffer tail */
 	MEMIF_CACHELINE_ALIGN_MARK(cacheline2);
 	memif_desc_t desc[0];			/**< buffer descriptors */
 } memif_ring_t;
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index a59f809..b1c871e 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -275,8 +275,14 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	ring_size = 1 << mq->log2_ring_size;
 	mask = ring_size - 1;
 
-	cur_slot = (type == MEMIF_RING_S2M) ? mq->last_head : mq->last_tail;
-	last_slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
+	if (type == MEMIF_RING_S2M) {
+		cur_slot = mq->last_head;
+		last_slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
+	} else {
+		cur_slot = mq->last_tail;
+		last_slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
+	}
+
 	if (cur_slot == last_slot)
 		goto refill;
 	n_slots = last_slot - cur_slot;
@@ -344,8 +350,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 no_free_bufs:
 	if (type == MEMIF_RING_S2M) {
-		rte_mb();
-		ring->tail = cur_slot;
+		__atomic_store_n(&ring->tail, cur_slot, __ATOMIC_RELEASE);
 		mq->last_head = cur_slot;
 	} else {
 		mq->last_tail = cur_slot;
@@ -353,7 +358,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 refill:
 	if (type == MEMIF_RING_M2S) {
-		head = ring->head;
+		head = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
 		n_slots = ring_size - head + mq->last_tail;
 
 		while (n_slots--) {
@@ -361,8 +366,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 			d0 = &ring->desc[s0];
 			d0->length = pmd->run.pkt_buffer_size;
 		}
-		rte_mb();
-		ring->head = head;
+		__atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
 	}
 
 	mq->n_pkts += n_rx_pkts;
@@ -398,14 +402,16 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	ring_size = 1 << mq->log2_ring_size;
 	mask = ring_size - 1;
 
-	n_free = ring->tail - mq->last_tail;
+	n_free = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE) - mq->last_tail;
 	mq->last_tail += n_free;
-	slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
 
-	if (type == MEMIF_RING_S2M)
-		n_free = ring_size - ring->head + mq->last_tail;
-	else
-		n_free = ring->head - ring->tail;
+	if (type == MEMIF_RING_S2M) {
+		slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
+		n_free = ring_size - slot + mq->last_tail;
+	} else {
+		slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
+		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
+	}
 
 	while (n_tx_pkts < nb_pkts && n_free) {
 		mbuf_head = *bufs++;
@@ -464,11 +470,10 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	}
 
 no_free_slots:
-	rte_mb();
 	if (type == MEMIF_RING_S2M)
-		ring->head = slot;
+		__atomic_store_n(&ring->head, slot, __ATOMIC_RELEASE);
 	else
-		ring->tail = slot;
+		__atomic_store_n(&ring->tail, slot, __ATOMIC_RELEASE);
 
 	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
 		a = 1;
@@ -611,8 +616,8 @@ memif_init_rings(struct rte_eth_dev *dev)
 
 	for (i = 0; i < pmd->run.num_s2m_rings; i++) {
 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_S2M, i);
-		ring->head = 0;
-		ring->tail = 0;
+		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
@@ -627,8 +632,8 @@ memif_init_rings(struct rte_eth_dev *dev)
 
 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_M2S, i);
-		ring->head = 0;
-		ring->tail = 0;
+		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
@@ -734,8 +739,8 @@ memif_connect(struct rte_eth_dev *dev)
 				MIF_LOG(ERR, "Wrong ring");
 				return -1;
 			}
-			ring->head = 0;
-			ring->tail = 0;
+			__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+			__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 			mq->last_head = 0;
 			mq->last_tail = 0;
 			/* enable polling mode */
@@ -750,8 +755,8 @@ memif_connect(struct rte_eth_dev *dev)
 				MIF_LOG(ERR, "Wrong ring");
 				return -1;
 			}
-			ring->head = 0;
-			ring->tail = 0;
+			__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+			__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 			mq->last_head = 0;
 			mq->last_tail = 0;
 			/* enable polling mode */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
  2019-08-26 11:00 [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier Phil Yang
@ 2019-08-26 11:03 ` Phil Yang (Arm Technology China)
  2019-10-08 11:05   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-10-09  2:04 ` [dpdk-dev] [PATCH v2] " Phil Yang
  1 sibling, 1 reply; 9+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-08-26 11:03 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), jgrajcia, dev
  Cc: thomas, jerinj, Honnappa Nagarahalli, damarion, nd,
	Gavin Hu (Arm Technology China),
	nd

+ Gavin

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> Sent: Monday, August 26, 2019 7:00 PM
> To: jgrajcia@cisco.com; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; damarion@cisco.com; nd
> <nd@arm.com>
> Subject: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
> 
> Using 'rte_mb' to synchronize the shared ring head/tail between producer
> and consumer will stall the pipeline and damage performance on the weak
> memory model platforms, such like aarch64. Meanwhile update the shared
> ring head and tail are observable and ordered between CPUs on IA.
> 
> Optimized this full barrier with the one-way barrier can improve the
> throughput. On aarch64 n1sdp server this patch make testpmd throughput
> boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
>  drivers/net/memif/memif.h         |  4 +--
>  drivers/net/memif/rte_eth_memif.c | 53 +++++++++++++++++++++--------
> ----------
>  2 files changed, 31 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/net/memif/memif.h b/drivers/net/memif/memif.h
> index 3948b1f..a4d88c0 100644
> --- a/drivers/net/memif/memif.h
> +++ b/drivers/net/memif/memif.h
> @@ -169,9 +169,9 @@ typedef struct {
>  	uint32_t cookie;			/**< MEMIF_COOKIE */
>  	uint16_t flags;				/**< flags */
>  #define MEMIF_RING_FLAG_MASK_INT 1		/**< disable interrupt
> mode */
> -	volatile uint16_t head;			/**< pointer to ring buffer
> head */
> +	uint16_t head;			/**< pointer to ring buffer head */
>  	MEMIF_CACHELINE_ALIGN_MARK(cacheline1);
> -	volatile uint16_t tail;			/**< pointer to ring buffer
> tail */
> +	uint16_t tail;			/**< pointer to ring buffer tail */
>  	MEMIF_CACHELINE_ALIGN_MARK(cacheline2);
>  	memif_desc_t desc[0];			/**< buffer descriptors */
>  } memif_ring_t;
> diff --git a/drivers/net/memif/rte_eth_memif.c
> b/drivers/net/memif/rte_eth_memif.c
> index a59f809..b1c871e 100644
> --- a/drivers/net/memif/rte_eth_memif.c
> +++ b/drivers/net/memif/rte_eth_memif.c
> @@ -275,8 +275,14 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
>  	ring_size = 1 << mq->log2_ring_size;
>  	mask = ring_size - 1;
> 
> -	cur_slot = (type == MEMIF_RING_S2M) ? mq->last_head : mq-
> >last_tail;
> -	last_slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
> +	if (type == MEMIF_RING_S2M) {
> +		cur_slot = mq->last_head;
> +		last_slot = __atomic_load_n(&ring->head,
> __ATOMIC_ACQUIRE);
> +	} else {
> +		cur_slot = mq->last_tail;
> +		last_slot = __atomic_load_n(&ring->tail,
> __ATOMIC_ACQUIRE);
> +	}
> +
>  	if (cur_slot == last_slot)
>  		goto refill;
>  	n_slots = last_slot - cur_slot;
> @@ -344,8 +350,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
> 
>  no_free_bufs:
>  	if (type == MEMIF_RING_S2M) {
> -		rte_mb();
> -		ring->tail = cur_slot;
> +		__atomic_store_n(&ring->tail, cur_slot,
> __ATOMIC_RELEASE);
>  		mq->last_head = cur_slot;
>  	} else {
>  		mq->last_tail = cur_slot;
> @@ -353,7 +358,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
> 
>  refill:
>  	if (type == MEMIF_RING_M2S) {
> -		head = ring->head;
> +		head = __atomic_load_n(&ring->head,
> __ATOMIC_ACQUIRE);
>  		n_slots = ring_size - head + mq->last_tail;
> 
>  		while (n_slots--) {
> @@ -361,8 +366,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs,
> uint16_t nb_pkts)
>  			d0 = &ring->desc[s0];
>  			d0->length = pmd->run.pkt_buffer_size;
>  		}
> -		rte_mb();
> -		ring->head = head;
> +		__atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
>  	}
> 
>  	mq->n_pkts += n_rx_pkts;
> @@ -398,14 +402,16 @@ eth_memif_tx(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  	ring_size = 1 << mq->log2_ring_size;
>  	mask = ring_size - 1;
> 
> -	n_free = ring->tail - mq->last_tail;
> +	n_free = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE) - mq-
> >last_tail;
>  	mq->last_tail += n_free;
> -	slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
> 
> -	if (type == MEMIF_RING_S2M)
> -		n_free = ring_size - ring->head + mq->last_tail;
> -	else
> -		n_free = ring->head - ring->tail;
> +	if (type == MEMIF_RING_S2M) {
> +		slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
> +		n_free = ring_size - slot + mq->last_tail;
> +	} else {
> +		slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
> +		n_free = __atomic_load_n(&ring->head,
> __ATOMIC_ACQUIRE) - slot;
> +	}
> 
>  	while (n_tx_pkts < nb_pkts && n_free) {
>  		mbuf_head = *bufs++;
> @@ -464,11 +470,10 @@ eth_memif_tx(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
>  	}
> 
>  no_free_slots:
> -	rte_mb();
>  	if (type == MEMIF_RING_S2M)
> -		ring->head = slot;
> +		__atomic_store_n(&ring->head, slot, __ATOMIC_RELEASE);
>  	else
> -		ring->tail = slot;
> +		__atomic_store_n(&ring->tail, slot, __ATOMIC_RELEASE);
> 
>  	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
>  		a = 1;
> @@ -611,8 +616,8 @@ memif_init_rings(struct rte_eth_dev *dev)
> 
>  	for (i = 0; i < pmd->run.num_s2m_rings; i++) {
>  		ring = memif_get_ring(pmd, proc_private,
> MEMIF_RING_S2M, i);
> -		ring->head = 0;
> -		ring->tail = 0;
> +		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
> +		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
>  		ring->cookie = MEMIF_COOKIE;
>  		ring->flags = 0;
>  		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
> @@ -627,8 +632,8 @@ memif_init_rings(struct rte_eth_dev *dev)
> 
>  	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
>  		ring = memif_get_ring(pmd, proc_private,
> MEMIF_RING_M2S, i);
> -		ring->head = 0;
> -		ring->tail = 0;
> +		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
> +		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
>  		ring->cookie = MEMIF_COOKIE;
>  		ring->flags = 0;
>  		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
> @@ -734,8 +739,8 @@ memif_connect(struct rte_eth_dev *dev)
>  				MIF_LOG(ERR, "Wrong ring");
>  				return -1;
>  			}
> -			ring->head = 0;
> -			ring->tail = 0;
> +			__atomic_store_n(&ring->head, 0,
> __ATOMIC_RELAXED);
> +			__atomic_store_n(&ring->tail, 0,
> __ATOMIC_RELAXED);
>  			mq->last_head = 0;
>  			mq->last_tail = 0;
>  			/* enable polling mode */
> @@ -750,8 +755,8 @@ memif_connect(struct rte_eth_dev *dev)
>  				MIF_LOG(ERR, "Wrong ring");
>  				return -1;
>  			}
> -			ring->head = 0;
> -			ring->tail = 0;
> +			__atomic_store_n(&ring->head, 0,
> __ATOMIC_RELAXED);
> +			__atomic_store_n(&ring->tail, 0,
> __ATOMIC_RELAXED);
>  			mq->last_head = 0;
>  			mq->last_tail = 0;
>  			/* enable polling mode */
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
  2019-08-26 11:03 ` Phil Yang (Arm Technology China)
@ 2019-10-08 11:05   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-10-09  2:10     ` Phil Yang (Arm Technology China)
  0 siblings, 1 reply; 9+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco) @ 2019-10-08 11:05 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), dev
  Cc: thomas, jerinj, Honnappa Nagarahalli, Damjan Marion (damarion),
	nd, Gavin Hu (Arm Technology China),
	nd

> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > Sent: Monday, August 26, 2019 7:00 PM
> > To: jgrajcia@cisco.com; dev@dpdk.org
> > Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; damarion@cisco.com; nd
> <nd@arm.com>
> > Subject: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way
> > barrier
> >
> > Using 'rte_mb' to synchronize the shared ring head/tail between
> > producer and consumer will stall the pipeline and damage performance
> > on the weak memory model platforms, such like aarch64. Meanwhile
> > update the shared ring head and tail are observable and ordered between
> CPUs on IA.
> >
> > Optimized this full barrier with the one-way barrier can improve the
> > throughput. On aarch64 n1sdp server this patch make testpmd throughput
> > boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>

The patch is looking good, but 'MEMIF_VERSION_MAJOR' in memif.h needs to
be set to 3 as ring pointers are no longer volatile.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2] net/memif:  optimized with one-way barrier
  2019-08-26 11:00 [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier Phil Yang
  2019-08-26 11:03 ` Phil Yang (Arm Technology China)
@ 2019-10-09  2:04 ` " Phil Yang
  2019-10-09 11:14   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  1 sibling, 1 reply; 9+ messages in thread
From: Phil Yang @ 2019-10-09  2:04 UTC (permalink / raw)
  To: jgrajcia, ferruh.yigit, dev
  Cc: thomas, damarion, Honnappa.Nagarahalli, gavin.hu, nd

Using 'rte_mb' to synchronize the shared ring head/tail between producer
and consumer will stall the pipeline and damage performance on the weak
memory model platform such like AArch64. Meanwhile update the shared
ring head and tail are observable and ordered between CPUs on IA.

Optimized this full barrier with the one-way barrier can improve the
throughput. On AArch64 n1sdp server this patch make testpmd throughput
boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
v2:
Upgrade 'MEMIF_VERSION_MAJOR' number to 3. (Jakub Grajciar)

v1:
Initial version.

 drivers/net/memif/memif.h         |  6 ++---
 drivers/net/memif/rte_eth_memif.c | 53 +++++++++++++++++++++------------------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/drivers/net/memif/memif.h b/drivers/net/memif/memif.h
index 3948b1f..8000aa1 100644
--- a/drivers/net/memif/memif.h
+++ b/drivers/net/memif/memif.h
@@ -6,7 +6,7 @@
 #define _MEMIF_H_
 
 #define MEMIF_COOKIE		0x3E31F20
-#define MEMIF_VERSION_MAJOR	2
+#define MEMIF_VERSION_MAJOR	3
 #define MEMIF_VERSION_MINOR	0
 #define MEMIF_VERSION		((MEMIF_VERSION_MAJOR << 8) | MEMIF_VERSION_MINOR)
 #define MEMIF_NAME_SZ		32
@@ -169,9 +169,9 @@ typedef struct {
 	uint32_t cookie;			/**< MEMIF_COOKIE */
 	uint16_t flags;				/**< flags */
 #define MEMIF_RING_FLAG_MASK_INT 1		/**< disable interrupt mode */
-	volatile uint16_t head;			/**< pointer to ring buffer head */
+	uint16_t head;			/**< pointer to ring buffer head */
 	MEMIF_CACHELINE_ALIGN_MARK(cacheline1);
-	volatile uint16_t tail;			/**< pointer to ring buffer tail */
+	uint16_t tail;			/**< pointer to ring buffer tail */
 	MEMIF_CACHELINE_ALIGN_MARK(cacheline2);
 	memif_desc_t desc[0];			/**< buffer descriptors */
 } memif_ring_t;
diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index a59f809..b1c871e 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -275,8 +275,14 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	ring_size = 1 << mq->log2_ring_size;
 	mask = ring_size - 1;
 
-	cur_slot = (type == MEMIF_RING_S2M) ? mq->last_head : mq->last_tail;
-	last_slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
+	if (type == MEMIF_RING_S2M) {
+		cur_slot = mq->last_head;
+		last_slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
+	} else {
+		cur_slot = mq->last_tail;
+		last_slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
+	}
+
 	if (cur_slot == last_slot)
 		goto refill;
 	n_slots = last_slot - cur_slot;
@@ -344,8 +350,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 no_free_bufs:
 	if (type == MEMIF_RING_S2M) {
-		rte_mb();
-		ring->tail = cur_slot;
+		__atomic_store_n(&ring->tail, cur_slot, __ATOMIC_RELEASE);
 		mq->last_head = cur_slot;
 	} else {
 		mq->last_tail = cur_slot;
@@ -353,7 +358,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 
 refill:
 	if (type == MEMIF_RING_M2S) {
-		head = ring->head;
+		head = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
 		n_slots = ring_size - head + mq->last_tail;
 
 		while (n_slots--) {
@@ -361,8 +366,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 			d0 = &ring->desc[s0];
 			d0->length = pmd->run.pkt_buffer_size;
 		}
-		rte_mb();
-		ring->head = head;
+		__atomic_store_n(&ring->head, head, __ATOMIC_RELEASE);
 	}
 
 	mq->n_pkts += n_rx_pkts;
@@ -398,14 +402,16 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	ring_size = 1 << mq->log2_ring_size;
 	mask = ring_size - 1;
 
-	n_free = ring->tail - mq->last_tail;
+	n_free = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE) - mq->last_tail;
 	mq->last_tail += n_free;
-	slot = (type == MEMIF_RING_S2M) ? ring->head : ring->tail;
 
-	if (type == MEMIF_RING_S2M)
-		n_free = ring_size - ring->head + mq->last_tail;
-	else
-		n_free = ring->head - ring->tail;
+	if (type == MEMIF_RING_S2M) {
+		slot = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE);
+		n_free = ring_size - slot + mq->last_tail;
+	} else {
+		slot = __atomic_load_n(&ring->tail, __ATOMIC_ACQUIRE);
+		n_free = __atomic_load_n(&ring->head, __ATOMIC_ACQUIRE) - slot;
+	}
 
 	while (n_tx_pkts < nb_pkts && n_free) {
 		mbuf_head = *bufs++;
@@ -464,11 +470,10 @@ eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	}
 
 no_free_slots:
-	rte_mb();
 	if (type == MEMIF_RING_S2M)
-		ring->head = slot;
+		__atomic_store_n(&ring->head, slot, __ATOMIC_RELEASE);
 	else
-		ring->tail = slot;
+		__atomic_store_n(&ring->tail, slot, __ATOMIC_RELEASE);
 
 	if ((ring->flags & MEMIF_RING_FLAG_MASK_INT) == 0) {
 		a = 1;
@@ -611,8 +616,8 @@ memif_init_rings(struct rte_eth_dev *dev)
 
 	for (i = 0; i < pmd->run.num_s2m_rings; i++) {
 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_S2M, i);
-		ring->head = 0;
-		ring->tail = 0;
+		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
@@ -627,8 +632,8 @@ memif_init_rings(struct rte_eth_dev *dev)
 
 	for (i = 0; i < pmd->run.num_m2s_rings; i++) {
 		ring = memif_get_ring(pmd, proc_private, MEMIF_RING_M2S, i);
-		ring->head = 0;
-		ring->tail = 0;
+		__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+		__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 		ring->cookie = MEMIF_COOKIE;
 		ring->flags = 0;
 		for (j = 0; j < (1 << pmd->run.log2_ring_size); j++) {
@@ -734,8 +739,8 @@ memif_connect(struct rte_eth_dev *dev)
 				MIF_LOG(ERR, "Wrong ring");
 				return -1;
 			}
-			ring->head = 0;
-			ring->tail = 0;
+			__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+			__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 			mq->last_head = 0;
 			mq->last_tail = 0;
 			/* enable polling mode */
@@ -750,8 +755,8 @@ memif_connect(struct rte_eth_dev *dev)
 				MIF_LOG(ERR, "Wrong ring");
 				return -1;
 			}
-			ring->head = 0;
-			ring->tail = 0;
+			__atomic_store_n(&ring->head, 0, __ATOMIC_RELAXED);
+			__atomic_store_n(&ring->tail, 0, __ATOMIC_RELAXED);
 			mq->last_head = 0;
 			mq->last_tail = 0;
 			/* enable polling mode */
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
  2019-10-08 11:05   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
@ 2019-10-09  2:10     ` Phil Yang (Arm Technology China)
  2019-10-09 11:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  0 siblings, 1 reply; 9+ messages in thread
From: Phil Yang (Arm Technology China) @ 2019-10-09  2:10 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco), dev
  Cc: thomas, jerinj, Honnappa Nagarahalli, Damjan Marion (damarion),
	nd, Gavin Hu (Arm Technology China),
	ferruh.yigit, nd

> -----Original Message-----
> From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
> <jgrajcia@cisco.com>
> Sent: Tuesday, October 8, 2019 7:05 PM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Damjan Marion (damarion)
> <damarion@cisco.com>; nd <nd@arm.com>; Gavin Hu (Arm Technology
> China) <Gavin.Hu@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way
> barrier
> 
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > Sent: Monday, August 26, 2019 7:00 PM
> > > To: jgrajcia@cisco.com; dev@dpdk.org
> > > Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; damarion@cisco.com; nd
> > <nd@arm.com>
> > > Subject: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way
> > > barrier
> > >
> > > Using 'rte_mb' to synchronize the shared ring head/tail between
> > > producer and consumer will stall the pipeline and damage performance
> > > on the weak memory model platforms, such like aarch64. Meanwhile
> > > update the shared ring head and tail are observable and ordered
> between
> > CPUs on IA.
> > >
> > > Optimized this full barrier with the one-way barrier can improve the
> > > throughput. On aarch64 n1sdp server this patch make testpmd
> throughput
> > > boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> > >
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> 
> The patch is looking good, but 'MEMIF_VERSION_MAJOR' in memif.h needs
> to
> be set to 3 as ring pointers are no longer volatile.

Updated in v2. 
Thanks for your comments.

Thanks,
Phil

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/memif: optimized with one-way barrier
  2019-10-09  2:04 ` [dpdk-dev] [PATCH v2] " Phil Yang
@ 2019-10-09 11:14   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-10-10 14:04     ` David Marchand
  0 siblings, 1 reply; 9+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco) @ 2019-10-09 11:14 UTC (permalink / raw)
  To: Phil Yang, ferruh.yigit, dev
  Cc: thomas, Damjan Marion (damarion), Honnappa.Nagarahalli, gavin.hu, nd



> -----Original Message-----
> From: Phil Yang <phil.yang@arm.com>
> Sent: Wednesday, October 9, 2019 4:05 AM
> To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
> <jgrajcia@cisco.com>; ferruh.yigit@intel.com; dev@dpdk.org
> Cc: thomas@monjalon.net; Damjan Marion (damarion)
> <damarion@cisco.com>; Honnappa.Nagarahalli@arm.com;
> gavin.hu@arm.com; nd@arm.com
> Subject: [PATCH v2] net/memif: optimized with one-way barrier
> 
> Using 'rte_mb' to synchronize the shared ring head/tail between producer and
> consumer will stall the pipeline and damage performance on the weak
> memory model platform such like AArch64. Meanwhile update the shared
> ring head and tail are observable and ordered between CPUs on IA.
> 
> Optimized this full barrier with the one-way barrier can improve the
> throughput. On AArch64 n1sdp server this patch make testpmd throughput
> boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
> v2:
> Upgrade 'MEMIF_VERSION_MAJOR' number to 3. (Jakub Grajciar)
> 
> v1:
> Initial version.

I jumped the gun with the version bump. The change doesn't break compatibility. I'm putting reviewed label on v1.

Sorry for the inconvenience.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
  2019-10-09  2:10     ` Phil Yang (Arm Technology China)
@ 2019-10-09 11:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  2019-10-14  8:59         ` Ferruh Yigit
  0 siblings, 1 reply; 9+ messages in thread
From: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco) @ 2019-10-09 11:17 UTC (permalink / raw)
  To: Phil Yang (Arm Technology China), dev
  Cc: thomas, jerinj, Honnappa Nagarahalli, Damjan Marion (damarion),
	nd, Gavin Hu (Arm Technology China),
	ferruh.yigit, nd



> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > > Sent: Monday, August 26, 2019 7:00 PM
> > > > To: jgrajcia@cisco.com; dev@dpdk.org
> > > > Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; damarion@cisco.com; nd
> > > <nd@arm.com>
> > > > Subject: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way
> > > > barrier
> > > >
> > > > Using 'rte_mb' to synchronize the shared ring head/tail between
> > > > producer and consumer will stall the pipeline and damage
> > > > performance on the weak memory model platforms, such like aarch64.
> > > > Meanwhile update the shared ring head and tail are observable and
> > > > ordered
> > between
> > > CPUs on IA.
> > > >
> > > > Optimized this full barrier with the one-way barrier can improve
> > > > the throughput. On aarch64 n1sdp server this patch make testpmd
> > throughput
> > > > boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> > > >
> > > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> >
> > The patch is looking good, but 'MEMIF_VERSION_MAJOR' in memif.h needs
> > to be set to 3 as ring pointers are no longer volatile.
> 
> Updated in v2.
> Thanks for your comments.
> 
> Thanks,
> Phil

I jumped the gun with the version bump. The change doesn't break compatibility. I'm putting reviewed label on v1.

Sorry for the inconvenience.

Reviewed-by: Jakub Grajciar <jgrajcia@cisco.com>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] net/memif: optimized with one-way barrier
  2019-10-09 11:14   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
@ 2019-10-10 14:04     ` David Marchand
  0 siblings, 0 replies; 9+ messages in thread
From: David Marchand @ 2019-10-10 14:04 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
  Cc: Phil Yang, ferruh.yigit, dev, thomas, Damjan Marion (damarion),
	Honnappa.Nagarahalli, gavin.hu, nd

On Wed, Oct 9, 2019 at 1:15 PM Jakub Grajciar -X (jgrajcia - PANTHEON
TECHNOLOGIES at Cisco) <jgrajcia@cisco.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Wednesday, October 9, 2019 4:05 AM
> > To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
> > <jgrajcia@cisco.com>; ferruh.yigit@intel.com; dev@dpdk.org
> > Cc: thomas@monjalon.net; Damjan Marion (damarion)
> > <damarion@cisco.com>; Honnappa.Nagarahalli@arm.com;
> > gavin.hu@arm.com; nd@arm.com
> > Subject: [PATCH v2] net/memif: optimized with one-way barrier
> >
> > Using 'rte_mb' to synchronize the shared ring head/tail between producer and
> > consumer will stall the pipeline and damage performance on the weak
> > memory model platform such like AArch64. Meanwhile update the shared
> > ring head and tail are observable and ordered between CPUs on IA.
> >
> > Optimized this full barrier with the one-way barrier can improve the
> > throughput. On AArch64 n1sdp server this patch make testpmd throughput
> > boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> > v2:
> > Upgrade 'MEMIF_VERSION_MAJOR' number to 3. (Jakub Grajciar)
> >
> > v1:
> > Initial version.
>
> I jumped the gun with the version bump. The change doesn't break compatibility. I'm putting reviewed label on v1.
>
> Sorry for the inconvenience.

Ok, marking v2 as rejected then.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier
  2019-10-09 11:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
@ 2019-10-14  8:59         ` Ferruh Yigit
  0 siblings, 0 replies; 9+ messages in thread
From: Ferruh Yigit @ 2019-10-14  8:59 UTC (permalink / raw)
  To: Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco),
	Phil Yang (Arm Technology China),
	dev
  Cc: thomas, jerinj, Honnappa Nagarahalli, Damjan Marion (damarion),
	nd, Gavin Hu (Arm Technology China)

On 10/9/2019 12:17 PM, Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at
Cisco) wrote:
> 
> 
>>>>> -----Original Message-----
>>>>> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
>>>>> Sent: Monday, August 26, 2019 7:00 PM
>>>>> To: jgrajcia@cisco.com; dev@dpdk.org
>>>>> Cc: thomas@monjalon.net; jerinj@marvell.com; Honnappa Nagarahalli
>>>>> <Honnappa.Nagarahalli@arm.com>; damarion@cisco.com; nd
>>>> <nd@arm.com>
>>>>> Subject: [dpdk-dev] [PATCH v1] net/memif: optimized with one-way
>>>>> barrier
>>>>>
>>>>> Using 'rte_mb' to synchronize the shared ring head/tail between
>>>>> producer and consumer will stall the pipeline and damage
>>>>> performance on the weak memory model platforms, such like aarch64.
>>>>> Meanwhile update the shared ring head and tail are observable and
>>>>> ordered
>>> between
>>>> CPUs on IA.
>>>>>
>>>>> Optimized this full barrier with the one-way barrier can improve
>>>>> the throughput. On aarch64 n1sdp server this patch make testpmd
>>> throughput
>>>>> boost 2.1%. On Intel E5-2640, testpmd got 3.98% performance gain.
>>>>>
>>>>> Signed-off-by: Phil Yang <phil.yang@arm.com>
>>>>> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>>>
>>> The patch is looking good, but 'MEMIF_VERSION_MAJOR' in memif.h needs
>>> to be set to 3 as ring pointers are no longer volatile.
>>
>> Updated in v2.
>> Thanks for your comments.
>>
>> Thanks,
>> Phil
> 
> I jumped the gun with the version bump. The change doesn't break compatibility. I'm putting reviewed label on v1.
> 
> Sorry for the inconvenience.
> 
> Reviewed-by: Jakub Grajciar <jgrajcia@cisco.com>
> 

Applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, back to index

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-26 11:00 [dpdk-dev] [PATCH v1] net/memif: optimized with one-way barrier Phil Yang
2019-08-26 11:03 ` Phil Yang (Arm Technology China)
2019-10-08 11:05   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
2019-10-09  2:10     ` Phil Yang (Arm Technology China)
2019-10-09 11:17       ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
2019-10-14  8:59         ` Ferruh Yigit
2019-10-09  2:04 ` [dpdk-dev] [PATCH v2] " Phil Yang
2019-10-09 11:14   ` Jakub Grajciar -X (jgrajcia - PANTHEON TECHNOLOGIES at Cisco)
2019-10-10 14:04     ` David Marchand

DPDK patches and discussions

Archives are clonable:
	git clone --mirror http://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ http://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev


Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/ public-inbox