DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function
@ 2020-06-11 10:11 Radu Nicolau
  2020-06-11 10:11 ` [dpdk-dev] [PATCH v1 2/2] net/i40e: use movdiri to update queue tail registers Radu Nicolau
                   ` (13 more replies)
  0 siblings, 14 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-06-11 10:11 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev, Radu Nicolau

Add rte_write32_wc function that implements a WC store
using movdiri instruction.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 lib/librte_eal/x86/include/rte_io.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..3d74bec 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -11,6 +11,26 @@ extern "C" {
 
 #include "generic/rte_io.h"
 
+/**
+ * Write a 32-bit value to I/O device memory address *addr*.
+ * Uses MOVDIRI instruction to perform a direct-store operation using WC
+ * memory write protocol.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	asm volatile("sfence\n\t"
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v1 2/2] net/i40e: use movdiri to update queue tail registers
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
@ 2020-06-11 10:11 ` Radu Nicolau
  2020-06-11 12:23 ` [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Jerin Jacob
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-06-11 10:11 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev, Radu Nicolau

If available, use the MOVDIRI instruction instead of a regular MMIO write
to update queue tail registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 20 ++++++++++++++++++++
 drivers/net/i40e/i40e_ethdev_vf.c     | 10 ++++++++++
 drivers/net/i40e/i40e_fdir.c          |  4 ++++
 drivers/net/i40e/i40e_rxtx.c          | 19 +++++++++++++++----
 drivers/net/i40e/i40e_rxtx.h          |  2 ++
 drivers/net/i40e/i40e_rxtx_vec_avx2.c |  4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  |  4 ++--
 7 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..b642c6f 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,26 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#if defined(RTE_ARCH_X86)
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value, ...)			\
+	do {								\
+		uint32_t val = rte_cpu_to_le_32(value);			\
+		volatile void *addr = reg;				\
+		if (queue->use_movdiri)					\
+			rte_write32_wc(val, addr);			\
+		else							\
+			rte_write32##__VA_ARGS__(val, addr);		\
+	} while (0)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+		I40E_PCI_REG_WC_WRITE(queue, reg, value, _relaxed)
+#else
+	#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+		I40E_PCI_REG_WRITE(reg, value)
+	#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+		I40E_PCI_REG_WRITE_RELAXED(reg, value)
+#endif
+
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c
index eca716a..6a82b7b 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -1790,6 +1790,11 @@ i40evf_rxq_init(struct rte_eth_dev *dev, struct i40e_rx_queue *rxq)
 	rxq->max_pkt_len = RTE_MIN(len,
 		dev_data->dev_conf.rxmode.max_rx_pkt_len);
 
+#if defined(RTE_ARCH_X86)
+	/* use MOVDIRI if supported*/
+	rxq->use_movdiri = rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI);
+#endif
+
 	/**
 	 * Check if the jumbo frame and maximum packet length are set correctly
 	 */
@@ -1855,6 +1860,11 @@ i40evf_tx_init(struct rte_eth_dev *dev)
 	for (i = 0; i < dev->data->nb_tx_queues; i++)
 		txq[i]->qtx_tail = hw->hw_addr + I40E_QTX_TAIL1(i);
 
+#if defined(RTE_ARCH_X86)
+       /* use MOVDIRI if supported*/
+	txq[i]->use_movdiri = rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI);
+#endif
+
 	i40e_set_tx_function(dev);
 }
 
diff --git a/drivers/net/i40e/i40e_fdir.c b/drivers/net/i40e/i40e_fdir.c
index d59399a..6f1bc86 100644
--- a/drivers/net/i40e/i40e_fdir.c
+++ b/drivers/net/i40e/i40e_fdir.c
@@ -142,6 +142,10 @@ i40e_fdir_rx_queue_init(struct i40e_rx_queue *rxq)
 	}
 	rxq->qrx_tail = hw->hw_addr +
 		I40E_QRX_TAIL(rxq->vsi->base_queue);
+#if defined(RTE_ARCH_X86)
+	/* use MOVDIRI if supported*/
+	rxq->use_movdiri = rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI);
+#endif
 
 	rte_wmb();
 	/* Init the RX tail regieter. */
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..44bba68 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -2717,6 +2717,12 @@ i40e_tx_queue_init(struct i40e_tx_queue *txq)
 
 	txq->qtx_tail = hw->hw_addr + I40E_QTX_TAIL(pf_q);
 
+#if defined(RTE_ARCH_X86)
+       /* use MOVDIRI if supported*/
+	txq->use_movdiri = rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI);
+#endif
+
+
 	return err;
 }
 
@@ -2881,6 +2887,11 @@ i40e_rx_queue_init(struct i40e_rx_queue *rxq)
 
 	rxq->qrx_tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
 
+#if defined(RTE_ARCH_X86)
+	/* use MOVDIRI WC store if supported*/
+	rxq->use_movdiri = rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI);
+#endif
+
 	buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mp) -
 		RTE_PKTMBUF_HEADROOM);
 
diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 8f11f01..9c9d676 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -118,6 +118,7 @@ struct i40e_rx_queue {
 	uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
 	uint8_t dcb_tc;         /**< Traffic class of rx queue */
 	uint64_t offloads; /**< Rx offload flags of DEV_RX_OFFLOAD_* */
+	uint8_t use_movdiri; /**< use MOVDIRI if supported */
 };
 
 struct i40e_tx_entry {
@@ -159,6 +160,7 @@ struct i40e_tx_queue {
 	bool tx_deferred_start; /**< don't start this queue in dev start */
 	uint8_t dcb_tc;         /**< Traffic class of tx queue */
 	uint64_t offloads; /**< Tx offload flags of DEV_RX_OFFLOAD_* */
+	uint8_t use_movdiri; /**< use MOVDIRI if supported */
 };
 
 /** Offload features */
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
  2020-06-11 10:11 ` [dpdk-dev] [PATCH v1 2/2] net/i40e: use movdiri to update queue tail registers Radu Nicolau
@ 2020-06-11 12:23 ` Jerin Jacob
  2020-06-11 13:56   ` Nicolau, Radu
  2020-06-15 11:11 ` Ananyev, Konstantin
                   ` (11 subsequent siblings)
  13 siblings, 1 reply; 76+ messages in thread
From: Jerin Jacob @ 2020-06-11 12:23 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dpdk-dev, Beilei Xing, jia.guo, Richardson, Bruce, Ananyev, Konstantin

On Thu, Jun 11, 2020 at 3:41 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Add rte_write32_wc function that implements a WC store
> using movdiri instruction.
>
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
>  lib/librte_eal/x86/include/rte_io.h | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
>
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..3d74bec 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -11,6 +11,26 @@ extern "C" {
>
>  #include "generic/rte_io.h"
>
> +/**
> + * Write a 32-bit value to I/O device memory address *addr*.
> + * Uses MOVDIRI instruction to perform a direct-store operation using WC
> + * memory write protocol.

It will be an x86 specific API, Please change the API name to reflect that.


> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       asm volatile("sfence\n\t"
> +               /* MOVDIRI */
> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +               :
> +               : "a" (value), "d" (addr));
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function
  2020-06-11 12:23 ` [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Jerin Jacob
@ 2020-06-11 13:56   ` Nicolau, Radu
  2020-06-11 15:33     ` Jerin Jacob
  0 siblings, 1 reply; 76+ messages in thread
From: Nicolau, Radu @ 2020-06-11 13:56 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dpdk-dev, Beilei Xing, jia.guo, Richardson, Bruce, Ananyev, Konstantin


On 6/11/2020 1:23 PM, Jerin Jacob wrote:
> On Thu, Jun 11, 2020 at 3:41 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>> Add rte_write32_wc function that implements a WC store
>> using movdiri instruction.
>>
>> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
>> ---
>>   lib/librte_eal/x86/include/rte_io.h | 20 ++++++++++++++++++++
>>   1 file changed, 20 insertions(+)
>>
>> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..3d74bec 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -11,6 +11,26 @@ extern "C" {
>>
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * Write a 32-bit value to I/O device memory address *addr*.
>> + * Uses MOVDIRI instruction to perform a direct-store operation using WC
>> + * memory write protocol.
> It will be an x86 specific API, Please change the API name to reflect that.
>
You mean something like rte_x86_write32_wc?

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function
  2020-06-11 13:56   ` Nicolau, Radu
@ 2020-06-11 15:33     ` Jerin Jacob
  0 siblings, 0 replies; 76+ messages in thread
From: Jerin Jacob @ 2020-06-11 15:33 UTC (permalink / raw)
  To: Nicolau, Radu
  Cc: dpdk-dev, Beilei Xing, jia.guo, Richardson, Bruce, Ananyev, Konstantin

On Thu, Jun 11, 2020 at 7:26 PM Nicolau, Radu <radu.nicolau@intel.com> wrote:
>
>
> On 6/11/2020 1:23 PM, Jerin Jacob wrote:
> > On Thu, Jun 11, 2020 at 3:41 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
> >> Add rte_write32_wc function that implements a WC store
> >> using movdiri instruction.
> >>
> >> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> >> ---
> >>   lib/librte_eal/x86/include/rte_io.h | 20 ++++++++++++++++++++
> >>   1 file changed, 20 insertions(+)
> >>
> >> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> >> index 2db71b1..3d74bec 100644
> >> --- a/lib/librte_eal/x86/include/rte_io.h
> >> +++ b/lib/librte_eal/x86/include/rte_io.h
> >> @@ -11,6 +11,26 @@ extern "C" {
> >>
> >>   #include "generic/rte_io.h"
> >>
> >> +/**
> >> + * Write a 32-bit value to I/O device memory address *addr*.
> >> + * Uses MOVDIRI instruction to perform a direct-store operation using WC
> >> + * memory write protocol.
> > It will be an x86 specific API, Please change the API name to reflect that.
> >
> You mean something like rte_x86_write32_wc?

Yes. Something like that...

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
  2020-06-11 10:11 ` [dpdk-dev] [PATCH v1 2/2] net/i40e: use movdiri to update queue tail registers Radu Nicolau
  2020-06-11 12:23 ` [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Jerin Jacob
@ 2020-06-15 11:11 ` Ananyev, Konstantin
  2020-06-19 12:06 ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Radu Nicolau
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 76+ messages in thread
From: Ananyev, Konstantin @ 2020-06-15 11:11 UTC (permalink / raw)
  To: Nicolau, Radu, dev; +Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce

Hi Radu,

> 
> Add rte_write32_wc function that implements a WC store
> using movdiri instruction.

It is probably worth adding 1-2 lines of text
explaining what the advantages are (perf improvement or whatever).

> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
>  lib/librte_eal/x86/include/rte_io.h | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
> 
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..3d74bec 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -11,6 +11,26 @@ extern "C" {
> 
>  #include "generic/rte_io.h"
> 
> +/**
> + * Write a 32-bit value to I/O device memory address *addr*.
> + * Uses MOVDIRI instruction to perform a direct-store operation using WC
> + * memory write protocol.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +	asm volatile("sfence\n\t"

Why not rte_wmb()?

> +		/* MOVDIRI */
> +		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +		:
> +		: "a" (value), "d" (addr));
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (2 preceding siblings ...)
  2020-06-15 11:11 ` Ananyev, Konstantin
@ 2020-06-19 12:06 ` Radu Nicolau
  2020-06-19 12:06   ` [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  2020-07-01 13:14   ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Bruce Richardson
  2020-07-01 14:15 ` [dpdk-dev] [PATCH v3 0/2] " Radu Nicolau
                   ` (9 subsequent siblings)
  13 siblings, 2 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-06-19 12:06 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
v2 rework new eal io functions

 lib/librte_eal/include/generic/rte_io.h | 47 ++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 59 +++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..7391782 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not add an I/O memory barrier; it is useful for
+ * accessing the device registers of integrated controllers, which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..5efbf0d 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,67 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
+
+
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers
  2020-06-19 12:06 ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Radu Nicolau
@ 2020-06-19 12:06   ` Radu Nicolau
  2020-07-01 13:15     ` Bruce Richardson
  2020-07-01 13:14   ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Bruce Richardson
  1 sibling, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-06-19 12:06 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 6 ++++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..df414fd 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,12 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions
  2020-06-19 12:06 ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Radu Nicolau
  2020-06-19 12:06   ` [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-01 13:14   ` Bruce Richardson
  1 sibling, 0 replies; 76+ messages in thread
From: Bruce Richardson @ 2020-07-01 13:14 UTC (permalink / raw)
  To: Radu Nicolau; +Cc: dev, beilei.xing, jia.guo, konstantin.ananyev, jerinjacobk

On Fri, Jun 19, 2020 at 01:06:19PM +0100, Radu Nicolau wrote:
> Add rte_write32_wc and rte_write32_wc_relaxed functions
> that implement 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
> v2 rework new eal io functions
> 

I think you need to add to the cover letter, as suggested on v1 by
Konstantin, a bit about using WC memory for better performance.

>  lib/librte_eal/include/generic/rte_io.h | 47 ++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 59 +++++++++++++++++++++++++++++++++
>  2 files changed, 106 insertions(+)
> 
> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..7391782 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
>  static inline void
>  rte_write64(uint64_t value, volatile void *addr);
>  
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc(uint32_t value, volatile void *addr);
> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier, useful in
> + * accessing the device registers of integrated controllers which implicitly
> + * strongly ordered with respect to memory access.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +
>  #endif /* __DOXYGEN__ */
>  
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>  	rte_write64_relaxed(value, addr);
>  }
>  
> +#ifndef RTE_NATIVE_WRITE32_WC
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +	rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +	rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
> +
>  #endif /* RTE_OVERRIDE_IO_H */
>  

I like this approach, since it saves duplicating the non-overridden
functions. Nice!

>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..5efbf0d 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,67 @@
>  extern "C" {
>  #endif
>  
> +#include "rte_cpuflags.h"
> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
>  
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
> +{
> +	asm volatile(
> +		/* MOVDIRI */
> +		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +		:
> +		: "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +	static int _x86_movdiri_flag = -1;
> +	if (_x86_movdiri_flag == 1) {
> +		rte_wmb();
> +		_rte_x86_movdiri(value, addr);
> +	} else if (_x86_movdiri_flag == 0) {
> +		rte_write32(value, addr);
> +	} else {
> +		_x86_movdiri_flag =
> +			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +		if (_x86_movdiri_flag == 1) {
> +			rte_wmb();
> +			_rte_x86_movdiri(value, addr);
> +		} else {
> +			rte_write32(value, addr);
> +		}
> +	}
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +	static int _x86_movdiri_flag = -1;
> +	if (_x86_movdiri_flag == 1) {
> +		_rte_x86_movdiri(value, addr);
> +	} else if (_x86_movdiri_flag == 0) {
> +		rte_write32_relaxed(value, addr);
> +	} else {
> +		_x86_movdiri_flag =
> +			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +		if (_x86_movdiri_flag == 1)
> +			_rte_x86_movdiri(value, addr);
> +		else
> +			rte_write32_relaxed(value, addr);
> +	}
> +}
> +
> +
> +
> +

Rather a lot of whitespace here.

>  #ifdef __cplusplus
>  }
>  #endif
> -- 
> 2.7.4

With the nits called out above fixed:

Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers
  2020-06-19 12:06   ` [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-01 13:15     ` Bruce Richardson
  0 siblings, 0 replies; 76+ messages in thread
From: Bruce Richardson @ 2020-07-01 13:15 UTC (permalink / raw)
  To: Radu Nicolau; +Cc: dev, beilei.xing, jia.guo, konstantin.ananyev, jerinjacobk

On Fri, Jun 19, 2020 at 01:06:20PM +0100, Radu Nicolau wrote:
> Performance improvement: use a write combining store
> instead of a regular mmio write to update queue tail
> registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
Acked-by: Bruce Richardson <bruce.richardson@intel.com>

I assume there are plans to update other drivers similarly in future?

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v3 0/2] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (3 preceding siblings ...)
  2020-06-19 12:06 ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Radu Nicolau
@ 2020-07-01 14:15 ` Radu Nicolau
  2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 1/2] " Radu Nicolau
  2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  2020-07-02  9:23 ` [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions Radu Nicolau
                   ` (8 subsequent siblings)
  13 siblings, 2 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-01 14:15 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Implement 2 new functions that will enable write combining stores
depending on architecture. The functions are provided as a generic
stub and an x86-specific implementation.

The reason to implement these functions is to improve performance by
reducing the overhead associated with regular mmio writes when updating
the hardware queue tails and doorbells.

With this patch set the I40E PMD is updated to use the write combining
store functions with other PMDs to follow.

Radu Nicolau (2):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers

 drivers/net/i40e/base/i40e_osdep.h      |  6 ++++
 drivers/net/i40e/i40e_rxtx.c            |  8 ++---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  4 +--
 drivers/net/i40e/i40e_rxtx_vec_sse.c    |  4 +--
 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 6 files changed, 117 insertions(+), 8 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v3 1/2] eal: add WC store functions
  2020-07-01 14:15 ` [dpdk-dev] [PATCH v3 0/2] " Radu Nicolau
@ 2020-07-01 14:15   ` Radu Nicolau
  2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  1 sibling, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-01 14:15 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..7391782 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v3 2/2] net/i40e: use WC store to update queue tail registers
  2020-07-01 14:15 ` [dpdk-dev] [PATCH v3 0/2] " Radu Nicolau
  2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 1/2] " Radu Nicolau
@ 2020-07-01 14:15   ` Radu Nicolau
  1 sibling, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-01 14:15 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 6 ++++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..df414fd 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,12 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (4 preceding siblings ...)
  2020-07-01 14:15 ` [dpdk-dev] [PATCH v3 0/2] " Radu Nicolau
@ 2020-07-02  9:23 ` Radu Nicolau
  2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 1/2] " Radu Nicolau
  2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  2020-07-06 12:29 ` [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions Radu Nicolau
                   ` (7 subsequent siblings)
  13 siblings, 2 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-02  9:23 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E PMD is updated to use the write
combining store functions with other PMDs to follow.


Radu Nicolau (2):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers

 drivers/net/i40e/base/i40e_osdep.h      |  6 ++++
 drivers/net/i40e/i40e_rxtx.c            |  8 ++---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  4 +--
 drivers/net/i40e/i40e_rxtx_vec_sse.c    |  4 +--
 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 6 files changed, 117 insertions(+), 8 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v4 1/2] eal: add WC store functions
  2020-07-02  9:23 ` [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions Radu Nicolau
@ 2020-07-02  9:23   ` Radu Nicolau
  2020-07-03 15:19     ` David Marchand
  2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  1 sibling, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-02  9:23 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4: address feedback and include ack

 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..7391782 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v4 2/2] net/i40e: use WC store to update queue tail registers
  2020-07-02  9:23 ` [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions Radu Nicolau
  2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 1/2] " Radu Nicolau
@ 2020-07-02  9:23   ` Radu Nicolau
  1 sibling, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-02  9:23 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4: include ack

 drivers/net/i40e/base/i40e_osdep.h    | 6 ++++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..df414fd 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,12 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/2] eal: add WC store functions
  2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 1/2] " Radu Nicolau
@ 2020-07-03 15:19     ` David Marchand
  2020-07-06  9:15       ` Nicolau, Radu
  0 siblings, 1 reply; 76+ messages in thread
From: David Marchand @ 2020-07-03 15:19 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob

On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Add rte_write32_wc and rte_write32_wc_relaxed functions
> that implement 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.
>
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> v4: address feedback and include ack
>
>  lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
>  2 files changed, 103 insertions(+)
>
> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..7391782 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
>  static inline void
>  rte_write64(uint64_t value, volatile void *addr);
>
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc(uint32_t value, volatile void *addr);

This is a new API, and even if inlined, it should be marked experimental.

Is volatile necessary?


> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier, useful in
> + * accessing the device registers of integrated controllers which implicitly
> + * strongly ordered with respect to memory access.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +

Double empty line (there are some other in this patch that I won't flag again).


>  #endif /* __DOXYGEN__ */
>
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>         rte_write64_relaxed(value, addr);
>  }
>
> +#ifndef RTE_NATIVE_WRITE32_WC

Missing return type, this causes build failure on anything but x86.


> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
> +
>  #endif /* RTE_OVERRIDE_IO_H */
>
>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..c95ed67 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,64 @@
>  extern "C" {
>  #endif
>
> +#include "rte_cpuflags.h"

Inclusion of this header should be out of the extern "C" block.


> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
>
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
> +{
> +       asm volatile(
> +               /* MOVDIRI */
> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +               :
> +               : "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;
> +       if (_x86_movdiri_flag == 1) {
> +               rte_wmb();
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);

Can't this cpu flag check be moved in a constructor?
This would avoid this copy/paste.


> +               if (_x86_movdiri_flag == 1) {
> +                       rte_wmb();
> +                       _rte_x86_movdiri(value, addr);
> +               } else {
> +                       rte_write32(value, addr);
> +               }
> +       }
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;

Same check with a static variable with the same name.


I wonder if wrapping all of this in a single function would be more elegant.
Then rte_write32_wc(|_relaxed) would call it with a flag.


> +       if (_x86_movdiri_flag == 1) {
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32_relaxed(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +               if (_x86_movdiri_flag == 1)
> +                       _rte_x86_movdiri(value, addr);
> +               else
> +                       rte_write32_relaxed(value, addr);
> +       }
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4
>



-- 
David Marchand


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v4 1/2] eal: add WC store functions
  2020-07-03 15:19     ` David Marchand
@ 2020-07-06  9:15       ` Nicolau, Radu
  0 siblings, 0 replies; 76+ messages in thread
From: Nicolau, Radu @ 2020-07-06  9:15 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob

Hi David, thanks for reviewing!

Some comments inline.


On 7/3/2020 4:19 PM, David Marchand wrote:
> On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau<radu.nicolau@intel.com>  wrote:
>> +static inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr);
> This is a new API, and even if inlined, it should be marked experimental.
Will do.
> Is volatile necessary?
Yes, most of these functions will be called on mmio addresses/volatile 
pointers. All other io functions have it.
>
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +
> Double empty line (there are some other in this patch that I won't flag again).
I will check all occurrences.
>
>
>>   #endif /* __DOXYGEN__ */
>>
>>   #ifndef RTE_OVERRIDE_IO_H
>> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>>          rte_write64_relaxed(value, addr);
>>   }
>>
>> +#ifndef RTE_NATIVE_WRITE32_WC
> Missing return type, this causes build failure on anything but x86.
Yes, got lost in the copy/paste. I will fix it in the next version
>
>
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32_relaxed(value, addr);
>> +}
>> +#endif /* RTE_NATIVE_WRITE32_WC */
>> +
>> +
>>   #endif /* RTE_OVERRIDE_IO_H */
>>
>>   #endif /* _RTE_IO_H_ */
>> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..c95ed67 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -9,8 +9,64 @@
>>   extern "C" {
>>   #endif
>>
>> +#include "rte_cpuflags.h"
> Inclusion of this header should be out of the extern "C" block.
Why? It is used elsewhere inside the extern "C" block e.g. 
x86/rte_spinlock.h
>
>
>> +
>> +#define RTE_NATIVE_WRITE32_WC
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * @internal
>> + * MOVDIRI wrapper.
>> + */
>> +static __rte_always_inline void
>> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
>> +{
>> +       asm volatile(
>> +               /* MOVDIRI */
>> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
>> +               :
>> +               : "a" (value), "d" (addr));
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
>> +       if (_x86_movdiri_flag == 1) {
>> +               rte_wmb();
>> +               _rte_x86_movdiri(value, addr);
>> +       } else if (_x86_movdiri_flag == 0) {
>> +               rte_write32(value, addr);
>> +       } else {
>> +               _x86_movdiri_flag =
>> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> Can't this cpu flag check be moved in a constructor?
> This would avoid this copy/paste.
We evaluated this approach but it creates more problems than it fixes - 
it will need a variable that needs to be exported and there is no good 
place to put it.
>
>
>> +               if (_x86_movdiri_flag == 1) {
>> +                       rte_wmb();
>> +                       _rte_x86_movdiri(value, addr);
>> +               } else {
>> +                       rte_write32(value, addr);
>> +               }
>> +       }
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
> Same check with a static variable with the same name.
It should be no problem, they are static local variables.
>
>
> I wonder if wrapping all of this in a single function would be more elegant.
> Then rte_write32_wc(|_relaxed) would call it with a flag.
Yes, it will be more elegant but also it will cost more, it was written 
like this to minimize the number of branches taken for the movdiri path.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (5 preceding siblings ...)
  2020-07-02  9:23 ` [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions Radu Nicolau
@ 2020-07-06 12:29 ` Radu Nicolau
  2020-07-06 12:29   ` [dpdk-dev] [PATCH v5 1/2] " Radu Nicolau
  2020-07-06 12:30   ` [dpdk-dev] [PATCH v5 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
                   ` (6 subsequent siblings)
  13 siblings, 2 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-06 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E PMD is updated to use the write
combining store functions with other PMDs to follow.

Radu Nicolau (2):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers

 drivers/net/i40e/base/i40e_osdep.h      |  5 +++
 drivers/net/i40e/i40e_rxtx.c            |  8 ++---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  4 +--
 drivers/net/i40e/i40e_rxtx_vec_sse.c    |  4 +--
 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 6 files changed, 117 insertions(+), 8 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v5 1/2] eal: add WC store functions
  2020-07-06 12:29 ` [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions Radu Nicolau
@ 2020-07-06 12:29   ` Radu Nicolau
  2020-07-06 12:30   ` [dpdk-dev] [PATCH v5 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
  1 sibling, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-06 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v5: small fixes after feedback
 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v5 2/2] net/i40e: use WC store to update queue tail registers
  2020-07-06 12:29 ` [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions Radu Nicolau
  2020-07-06 12:29   ` [dpdk-dev] [PATCH v5 1/2] " Radu Nicolau
@ 2020-07-06 12:30   ` Radu Nicolau
  1 sibling, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-06 12:30 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v5: small fixes after feedback

 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..69ab717 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (6 preceding siblings ...)
  2020-07-06 12:29 ` [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions Radu Nicolau
@ 2020-07-13 12:27 ` Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 1/4] " Radu Nicolau
                     ` (3 more replies)
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
                   ` (5 subsequent siblings)
  13 siblings, 4 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-13 12:27 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.


Radu Nicolau (4):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update doorbell register

 .../qat/qat_adf/adf_transport_access_macros.h      |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h                 |  5 ++
 drivers/net/i40e/i40e_rxtx.c                       |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c              |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c               |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h               |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                     |  4 +-
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c             |  4 +-
 lib/librte_eal/include/generic/rte_io.h            | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h                | 56 ++++++++++++++++++++++
 10 files changed, 131 insertions(+), 14 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] eal: add WC store functions
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-13 12:27   ` Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-13 12:27 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v6: add QAT and IXGBE updates

 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 1/4] " Radu Nicolau
@ 2020-07-13 12:27   ` Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 3/4] qat: " Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-13 12:27 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..69ab717 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] qat: use WC store to update queue tail registers
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 1/4] " Radu Nicolau
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-13 12:27   ` Radu Nicolau
  2020-07-13 12:44     ` Bruce Richardson
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-13 12:27 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef551..504ffb7 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] net/ixgbe: use WC store to update doorbell register
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 3/4] qat: " Radu Nicolau
@ 2020-07-13 12:27   ` Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-13 12:27 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   | 6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 4 ++--
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 4 ++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7..cacf724 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 2e20e18..06e8dd5 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca31..e77a7f3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/4] qat: use WC store to update queue tail registers
  2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 3/4] qat: " Radu Nicolau
@ 2020-07-13 12:44     ` Bruce Richardson
  2020-07-13 12:52       ` Trahe, Fiona
  0 siblings, 1 reply; 76+ messages in thread
From: Bruce Richardson @ 2020-07-13 12:44 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dev, beilei.xing, jia.guo, konstantin.ananyev, jerinjacobk,
	david.marchand, fiona.trahe, wei.zhao1

On Mon, Jul 13, 2020 at 01:27:26PM +0100, Radu Nicolau wrote:
> Performance improvement: use a write combining store
> instead of a regular mmio write to update queue tail
> registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
minor nit, I think the subject prefix on the patch should be crypto/qat.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/4] qat: use WC store to update queue tail registers
  2020-07-13 12:44     ` Bruce Richardson
@ 2020-07-13 12:52       ` Trahe, Fiona
  2020-07-13 12:57         ` Bruce Richardson
  0 siblings, 1 reply; 76+ messages in thread
From: Trahe, Fiona @ 2020-07-13 12:52 UTC (permalink / raw)
  To: Richardson, Bruce, Nicolau, Radu
  Cc: dev, Xing, Beilei, Guo, Jia, Ananyev, Konstantin, jerinjacobk,
	david.marchand, Zhao1, Wei



> -----Original Message-----
> From: Bruce Richardson <bruce.richardson@intel.com>
> Sent: Monday, July 13, 2020 1:45 PM
> To: Nicolau, Radu <radu.nicolau@intel.com>
> Cc: dev@dpdk.org; Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; jerinjacobk@gmail.com; david.marchand@redhat.com; Trahe, Fiona
> <fiona.trahe@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>
> Subject: Re: [PATCH v6 3/4] qat: use WC store to update queue tail registers
> 
> On Mon, Jul 13, 2020 at 01:27:26PM +0100, Radu Nicolau wrote:
> > Performance improvement: use a write combining store
> > instead of a regular mmio write to update queue tail
> > registers.
> >
> > Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> > ---
> minor nit, I think the subject prefix on the patch should be crypto/qat.
When it affects all qat PMDs (i.e. crypto sym, crypto asym and compression), as
this does, we usually use drivers/qat.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/4] qat: use WC store to update queue tail registers
  2020-07-13 12:52       ` Trahe, Fiona
@ 2020-07-13 12:57         ` Bruce Richardson
  0 siblings, 0 replies; 76+ messages in thread
From: Bruce Richardson @ 2020-07-13 12:57 UTC (permalink / raw)
  To: Trahe, Fiona
  Cc: Nicolau, Radu, dev, Xing, Beilei, Guo, Jia, Ananyev, Konstantin,
	jerinjacobk, david.marchand, Zhao1, Wei

On Mon, Jul 13, 2020 at 01:52:24PM +0100, Trahe, Fiona wrote:
> 
> 
> > -----Original Message-----
> > From: Bruce Richardson <bruce.richardson@intel.com>
> > Sent: Monday, July 13, 2020 1:45 PM
> > To: Nicolau, Radu <radu.nicolau@intel.com>
> > Cc: dev@dpdk.org; Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; jerinjacobk@gmail.com; david.marchand@redhat.com; Trahe, Fiona
> > <fiona.trahe@intel.com>; Zhao1, Wei <wei.zhao1@intel.com>
> > Subject: Re: [PATCH v6 3/4] qat: use WC store to update queue tail registers
> >
> > On Mon, Jul 13, 2020 at 01:27:26PM +0100, Radu Nicolau wrote:
> > > Performance improvement: use a write combining store
> > > instead of a regular mmio write to update queue tail
> > > registers.
> > >
> > > Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> > > ---
> > minor nit, I think the subject prefix on the patch should be crypto/qat.
> when it affects all qat PMDs, i.e. crypto sym, crypto asym and compression, as
> this does we usually use drivers/qat

Ah, yes, sorry, I forgot that there was more than one qat directory in
drivers. In this case, the changes are in common only, so common/qat would
seem to be the right prefix. Not that it matters to the patch
functionality, mind... :-)

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (7 preceding siblings ...)
  2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-16 12:29 ` Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 1/4] " Radu Nicolau
                     ` (3 more replies)
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
                   ` (4 subsequent siblings)
  13 siblings, 4 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-16 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.

Radu Nicolau (4):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update doorbell register

 .../qat/qat_adf/adf_transport_access_macros.h      |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h                 |  5 ++
 drivers/net/i40e/i40e_rxtx.c                       |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c              |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c               |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h               |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                     |  4 +-
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c             |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h             | 12 +++++
 lib/librte_eal/include/generic/rte_io.h            | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h                | 56 ++++++++++++++++++++++
 11 files changed, 143 insertions(+), 14 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] eal: add WC store functions
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-16 12:29   ` Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-16 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v7: fix ARM64 build

 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e534624..d07d9cb 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform, write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform, write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * The relaxed version does not have an additional I/O memory barrier, useful
+ * in accessing the device registers of integrated controllers which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 1/4] " Radu Nicolau
@ 2020-07-16 12:29   ` Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 3/4] common/qat: " Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-16 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..69ab717 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] common/qat: use WC store to update queue tail registers
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 1/4] " Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-16 12:29   ` Radu Nicolau
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-16 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef551..504ffb7 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] net/ixgbe: use WC store to update doorbell register
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 3/4] common/qat: " Radu Nicolau
@ 2020-07-16 12:29   ` Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-16 12:29 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   | 6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 4 ++--
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c | 4 ++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7..cacf724 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 2e20e18..06e8dd5 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca31..e77a7f3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (8 preceding siblings ...)
  2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-17 10:49 ` Radu Nicolau
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 1/4] " Radu Nicolau
                     ` (3 more replies)
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
                   ` (3 subsequent siblings)
  13 siblings, 4 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-17 10:49 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.

Radu Nicolau (4):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update queue tail registers

 .../qat/qat_adf/adf_transport_access_macros.h      |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h                 |  5 ++
 drivers/net/i40e/i40e_rxtx.c                       |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c              |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c               |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h               |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                     | 12 ++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c             |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h             | 12 +++++
 lib/librte_eal/include/generic/rte_io.h            | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h                | 56 ++++++++++++++++++++++
 11 files changed, 147 insertions(+), 18 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v8 1/4] eal: add WC store functions
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-17 10:49   ` Radu Nicolau
  2020-07-20  6:42     ` Ruifeng Wang
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-17 10:49 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e534624..d07d9cb 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform, write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform, write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * The relaxed version does not have an additional I/O memory barrier, useful
+ * in accessing the device registers of integrated controllers which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 1/4] " Radu Nicolau
@ 2020-07-17 10:49   ` Radu Nicolau
  2020-07-20  6:46     ` Ruifeng Wang
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 3/4] common/qat: " Radu Nicolau
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-17 10:49 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..69ab717 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..64e43ac 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..294c1c4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..a4635e0 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v8 3/4] common/qat: use WC store to update queue tail registers
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 1/4] " Radu Nicolau
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-17 10:49   ` Radu Nicolau
  2020-07-17 16:42     ` Trahe, Fiona
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-17 10:49 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef551..504ffb7 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v8 4/4] net/ixgbe: use WC store to update queue tail registers
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 3/4] common/qat: " Radu Nicolau
@ 2020-07-17 10:49   ` Radu Nicolau
  2020-07-17 11:18     ` Ananyev, Konstantin
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-17 10:49 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 12 ++++++------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7..cacf724 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 2e20e18..669b23e 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 					    cur_free_trigger);
 	}
 
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2096,7 +2096,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+				IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 							    next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
@@ -2262,7 +2262,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca31..e77a7f3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 4/4] net/ixgbe: use WC store to update queue tail registers
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 4/4] net/ixgbe: " Radu Nicolau
@ 2020-07-17 11:18     ` Ananyev, Konstantin
  0 siblings, 0 replies; 76+ messages in thread
From: Ananyev, Konstantin @ 2020-07-17 11:18 UTC (permalink / raw)
  To: Nicolau, Radu, dev
  Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce, jerinjacobk,
	david.marchand, Trahe, Fiona, Zhao1, Wei, ruifeng.wang


> Performance improvement: use a write combining store
> instead of a regular mmio write to update queue tail
> registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
>  drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
>  drivers/net/ixgbe/ixgbe_rxtx.c         | 12 ++++++------
>  drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
>  3 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
> index dc712b7..cacf724 100644
> --- a/drivers/net/ixgbe/base/ixgbe_osdep.h
> +++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
> @@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
>  #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
>  	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
> 
> +#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
> +	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
> +
> +#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
> +	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
> +
>  #define IXGBE_PCI_REG_ADDR(hw, reg) \
>  	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
> index 2e20e18..669b23e 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx.c
> @@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> 
>  	/* update tail pointer */
>  	rte_wmb();
> -	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
> +	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
> 
>  	return nb_pkts;
>  }
> @@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
>  	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
>  		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
>  		   (unsigned) tx_id, (unsigned) nb_tx);
> -	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
> +	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
>  	txq->tx_tail = tx_id;
> 
>  	return nb_tx;
> @@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> 
>  		/* update tail pointer */
>  		rte_wmb();
> -		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
> +		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
>  					    cur_free_trigger);
>  	}
> 
> @@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>  			   (unsigned) nb_rx);
>  		rx_id = (uint16_t) ((rx_id == 0) ?
>  				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
> -		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
> +		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
>  		nb_hold = 0;
>  	}
>  	rxq->nb_rx_hold = nb_hold;
> @@ -2096,7 +2096,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
> 
>  			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
>  				rte_wmb();
> -				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
> +				IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
>  							    next_rdt);
>  				nb_hold -= rxq->rx_free_thresh;
>  			} else {
> @@ -2262,7 +2262,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
>  			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
> 
>  		rte_wmb();
> -		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
> +		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
>  		nb_hold = 0;
>  	}
> 
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> index 517ca31..e77a7f3 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
> @@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
>  			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> 
>  	/* Update the tail pointer on the NIC */
> -	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
> +	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
>  }
> 
>  #ifdef RTE_LIBRTE_SECURITY
> @@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
> 
>  	txq->tx_tail = tx_id;
> 
> -	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
> +	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
> 
>  	return nb_pkts;
>  }
> --

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> 2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 3/4] common/qat: use WC store to update queue tail registers
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 3/4] common/qat: " Radu Nicolau
@ 2020-07-17 16:42     ` Trahe, Fiona
  0 siblings, 0 replies; 76+ messages in thread
From: Trahe, Fiona @ 2020-07-17 16:42 UTC (permalink / raw)
  To: Nicolau, Radu, dev
  Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce, Ananyev, Konstantin,
	jerinjacobk, david.marchand, Zhao1, Wei, ruifeng.wang, Trahe,
	Fiona



> -----Original Message-----
> From: Nicolau, Radu <radu.nicolau@intel.com>
> Sent: Friday, July 17, 2020 11:50 AM
> To: dev@dpdk.org
> Cc: Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> jerinjacobk@gmail.com; david.marchand@redhat.com; Trahe, Fiona <fiona.trahe@intel.com>; Zhao1,
> Wei <wei.zhao1@intel.com>; ruifeng.wang@arm.com; Nicolau, Radu <radu.nicolau@intel.com>
> Subject: [PATCH v8 3/4] common/qat: use WC store to update queue tail registers
> 
> Performance improvement: use a write combining store
> instead of a regular mmio write to update queue tail
> registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Fiona Trahe <fiona.trahe@intel.com>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 1/4] eal: add WC store functions
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 1/4] " Radu Nicolau
@ 2020-07-20  6:42     ` Ruifeng Wang
  2020-07-20  8:52       ` Nicolau, Radu
  0 siblings, 1 reply; 76+ messages in thread
From: Ruifeng Wang @ 2020-07-20  6:42 UTC (permalink / raw)
  To: Radu Nicolau, dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1, nd


> -----Original Message-----
> From: Radu Nicolau <radu.nicolau@intel.com>
> Sent: Friday, July 17, 2020 6:50 PM
> To: dev@dpdk.org
> Cc: beilei.xing@intel.com; jia.guo@intel.com; bruce.richardson@intel.com;
> konstantin.ananyev@intel.com; jerinjacobk@gmail.com;
> david.marchand@redhat.com; fiona.trahe@intel.com; wei.zhao1@intel.com;
> Ruifeng Wang <Ruifeng.Wang@arm.com>; Radu Nicolau
> <radu.nicolau@intel.com>
> Subject: [PATCH v8 1/4] eal: add WC store functions
> 
> Add rte_write32_wc and rte_write32_wc_relaxed functions that implement
> 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
> lib/librte_eal/include/generic/rte_io.h | 48
> ++++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 56
> +++++++++++++++++++++++++++++++++
>  3 files changed, 116 insertions(+)
> 
> diff --git a/lib/librte_eal/arm/include/rte_io_64.h
> b/lib/librte_eal/arm/include/rte_io_64.h
> index e534624..d07d9cb 100644
> --- a/lib/librte_eal/arm/include/rte_io_64.h
> +++ b/lib/librte_eal/arm/include/rte_io_64.h
> @@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
>  	rte_write64_relaxed(value, addr);
>  }
> 
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr) {
> +	rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
> +	rte_write32_relaxed(value, addr);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_io.h
> b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..0669baa 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
> static inline void  rte_write64(uint64_t value, volatile void *addr);
> 
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write
> +combining
> + * may not be available and/or may be treated as a hint and the
> +behavior may
> + * fallback to a regular store.

I'm trying to understand write combining use cases here.
Is it applicable for all MMIO writes? 
How to identify where to use rte_write32_wc(_relaxed)?

Thanks.
/Ruifeng
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to  */ __rte_experimental
> +static inline void rte_write32_wc(uint32_t value, volatile void *addr);
> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write
> +combining
> + * may not be available and/or may be treated as a hint and the
> +behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier,
> +useful in
> + * accessing the device registers of integrated controllers which
> +implicitly
> + * strongly ordered with respect to memory access.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to  */ __rte_experimental
> +static inline void rte_write32_wc_relaxed(uint32_t value, volatile void
> +*addr);
> +
>  #endif /* __DOXYGEN__ */
> 
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
>  	rte_write64_relaxed(value, addr);
>  }
> 
> +#ifndef RTE_NATIVE_WRITE32_WC
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr) {
> +	rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
> +	rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
>  #endif /* RTE_OVERRIDE_IO_H */
> 
>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h
> b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..c95ed67 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,64 @@
>  extern "C" {
>  #endif
> 
> +#include "rte_cpuflags.h"
> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
> 
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr) {
> +	asm volatile(
> +		/* MOVDIRI */
> +		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +		:
> +		: "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr) {
> +	static int _x86_movdiri_flag = -1;
> +	if (_x86_movdiri_flag == 1) {
> +		rte_wmb();
> +		_rte_x86_movdiri(value, addr);
> +	} else if (_x86_movdiri_flag == 0) {
> +		rte_write32(value, addr);
> +	} else {
> +		_x86_movdiri_flag =
> +
> 	(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +		if (_x86_movdiri_flag == 1) {
> +			rte_wmb();
> +			_rte_x86_movdiri(value, addr);
> +		} else {
> +			rte_write32(value, addr);
> +		}
> +	}
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
> +	static int _x86_movdiri_flag = -1;
> +	if (_x86_movdiri_flag == 1) {
> +		_rte_x86_movdiri(value, addr);
> +	} else if (_x86_movdiri_flag == 0) {
> +		rte_write32_relaxed(value, addr);
> +	} else {
> +		_x86_movdiri_flag =
> +
> 	(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +		if (_x86_movdiri_flag == 1)
> +			_rte_x86_movdiri(value, addr);
> +		else
> +			rte_write32_relaxed(value, addr);
> +	}
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-20  6:46     ` Ruifeng Wang
  2020-07-20  8:54       ` Nicolau, Radu
  0 siblings, 1 reply; 76+ messages in thread
From: Ruifeng Wang @ 2020-07-20  6:46 UTC (permalink / raw)
  To: Radu Nicolau, dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1, nd


> -----Original Message-----
> From: Radu Nicolau <radu.nicolau@intel.com>
> Sent: Friday, July 17, 2020 6:50 PM
> To: dev@dpdk.org
> Cc: beilei.xing@intel.com; jia.guo@intel.com; bruce.richardson@intel.com;
> konstantin.ananyev@intel.com; jerinjacobk@gmail.com;
> david.marchand@redhat.com; fiona.trahe@intel.com; wei.zhao1@intel.com;
> Ruifeng Wang <Ruifeng.Wang@arm.com>; Radu Nicolau
> <radu.nicolau@intel.com>
> Subject: [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers
> 
> Performance improvement: use a write combining store instead of a regular
> mmio write to update queue tail registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
>  drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
>  drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
> drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
>  4 files changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/i40e/base/i40e_osdep.h
> b/drivers/net/i40e/base/i40e_osdep.h
> index 58be396..69ab717 100644
> --- a/drivers/net/i40e/base/i40e_osdep.h
> +++ b/drivers/net/i40e/base/i40e_osdep.h
> @@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void
> *addr)
>  #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
>  	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
> 
> +#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \

'queue' is not necessary since it is not used. Can it be removed?

Thanks.
/Ruifeng
> +	rte_write32_wc((rte_cpu_to_le_32(value)), reg) #define
> +I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
> +	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
> +
>  #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
> #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
> 
> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index
> 840b6f3..64e43ac 100644
> --- a/drivers/net/i40e/i40e_rxtx.c
> +++ b/drivers/net/i40e/i40e_rxtx.c
> @@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  	if (nb_hold > rxq->rx_free_thresh) {
>  		rx_id = (uint16_t) ((rx_id == 0) ?
>  			(rxq->nb_rx_desc - 1) : (rx_id - 1));
> -		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
> +		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>  		nb_hold = 0;
>  	}
>  	rxq->nb_rx_hold = nb_hold;
> @@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
>  	if (nb_hold > rxq->rx_free_thresh) {
>  		rx_id = (uint16_t)(rx_id == 0 ?
>  			(rxq->nb_rx_desc - 1) : (rx_id - 1));
> -		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
> +		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>  		nb_hold = 0;
>  	}
>  	rxq->nb_rx_hold = nb_hold;
> @@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>  		   (unsigned) tx_id, (unsigned) nb_tx);
> 
>  	rte_cio_wmb();
> -	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
> +	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
>  	txq->tx_tail = tx_id;
> 
>  	return nb_tx;
> @@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
>  		txq->tx_tail = 0;
> 
>  	/* Update the tx tail register */
> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
> 
>  	return nb_pkts;
>  }
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> index 3bcef13..294c1c4 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> @@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>  			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> 
>  	/* Update the tail pointer on the NIC */
> -	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
> +	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>  }
> 
>  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> @@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue,
> struct rte_mbuf **tx_pkts,
> 
>  	txq->tx_tail = tx_id;
> 
> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
> 
>  	return nb_pkts;
>  }
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c
> b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> index 6985183..a4635e0 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> @@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>  			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
> 
>  	/* Update the tail pointer on the NIC */
> -	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
> +	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>  }
> 
>  #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> @@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct
> rte_mbuf **tx_pkts,
> 
>  	txq->tx_tail = tx_id;
> 
> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
> 
>  	return nb_pkts;
>  }
> --
> 2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 1/4] eal: add WC store functions
  2020-07-20  6:42     ` Ruifeng Wang
@ 2020-07-20  8:52       ` Nicolau, Radu
  0 siblings, 0 replies; 76+ messages in thread
From: Nicolau, Radu @ 2020-07-20  8:52 UTC (permalink / raw)
  To: Ruifeng Wang, dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1, nd


On 7/20/2020 7:42 AM, Ruifeng Wang wrote:
>> -----Original Message-----
>> From: Radu Nicolau <radu.nicolau@intel.com>
>> Sent: Friday, July 17, 2020 6:50 PM
>> To: dev@dpdk.org
>> Cc: beilei.xing@intel.com; jia.guo@intel.com; bruce.richardson@intel.com;
>> konstantin.ananyev@intel.com; jerinjacobk@gmail.com;
>> david.marchand@redhat.com; fiona.trahe@intel.com; wei.zhao1@intel.com;
>> Ruifeng Wang <Ruifeng.Wang@arm.com>; Radu Nicolau
>> <radu.nicolau@intel.com>
>> Subject: [PATCH v8 1/4] eal: add WC store functions
>>
>> Add rte_write32_wc and rte_write32_wc_relaxed functions that implement
>> 32bit stores using write combining memory protocol.
>> Provided generic stubs and x86 implementation.
>>
>> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
>> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
>> ---
>>   lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
>> lib/librte_eal/include/generic/rte_io.h | 48
>> ++++++++++++++++++++++++++++
>>   lib/librte_eal/x86/include/rte_io.h     | 56
>> +++++++++++++++++++++++++++++++++
>>   3 files changed, 116 insertions(+)
>>
>> diff --git a/lib/librte_eal/arm/include/rte_io_64.h
>> b/lib/librte_eal/arm/include/rte_io_64.h
>> index e534624..d07d9cb 100644
>> --- a/lib/librte_eal/arm/include/rte_io_64.h
>> +++ b/lib/librte_eal/arm/include/rte_io_64.h
>> @@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
>>   	rte_write64_relaxed(value, addr);
>>   }
>>
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr) {
>> +	rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
>> +	rte_write32_relaxed(value, addr);
>> +}
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_eal/include/generic/rte_io.h
>> b/lib/librte_eal/include/generic/rte_io.h
>> index da457f7..0669baa 100644
>> --- a/lib/librte_eal/include/generic/rte_io.h
>> +++ b/lib/librte_eal/include/generic/rte_io.h
>> @@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
>> static inline void  rte_write64(uint64_t value, volatile void *addr);
>>
>> +/**
>> + * Write a 32-bit value to I/O device memory address addr using write
>> + * combining memory write protocol. Depending on the platform write
>> +combining
>> + * may not be available and/or may be treated as a hint and the
>> +behavior may
>> + * fallback to a regular store.
> I'm trying to understand write combining use cases here.
> Is it applicable for all MMIO writes?

It's dependent on the architecture and the specific use case, but generally 
updating the tail registers is a good use case. It has some 
particularities that prevent it from being a replacement for all MMIO 
writes: it is weakly ordered and it will bypass the cache hierarchy.

> How to identify where to use rte_write32_wc(_relaxed)?
The relaxed version can be used in sections of the code that already 
have the proper fencing, so as to avoid a redundant memory fence, or 
when there is no need for a memory fence at all.
>
> Thanks.
> /Ruifeng
>> + *
>> + * @param value
>> + *  Value to write
>> + * @param addr
>> + *  I/O memory address to write the value to  */ __rte_experimental
>> +static inline void rte_write32_wc(uint32_t value, volatile void *addr);
>> +
>> +/**
>> + * Write a 32-bit value to I/O device memory address addr using write
>> + * combining memory write protocol. Depending on the platform write
>> +combining
>> + * may not be available and/or may be treated as a hint and the
>> +behavior may
>> + * fallback to a regular store.
>> + *
>> + * The relaxed version does not have additional I/O memory barrier,
>> +useful in
>> + * accessing the device registers of integrated controllers which
>> +implicitly
>> + * strongly ordered with respect to memory access.
>> + *
>> + * @param value
>> + *  Value to write
>> + * @param addr
>> + *  I/O memory address to write the value to  */ __rte_experimental
>> +static inline void rte_write32_wc_relaxed(uint32_t value, volatile void
>> +*addr);
>> +
>>   #endif /* __DOXYGEN__ */
>>
>>   #ifndef RTE_OVERRIDE_IO_H
>> @@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
>>   	rte_write64_relaxed(value, addr);
>>   }
>>
>> +#ifndef RTE_NATIVE_WRITE32_WC
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr) {
>> +	rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
>> +	rte_write32_relaxed(value, addr);
>> +}
>> +#endif /* RTE_NATIVE_WRITE32_WC */
>> +
>>   #endif /* RTE_OVERRIDE_IO_H */
>>
>>   #endif /* _RTE_IO_H_ */
>> diff --git a/lib/librte_eal/x86/include/rte_io.h
>> b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..c95ed67 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -9,8 +9,64 @@
>>   extern "C" {
>>   #endif
>>
>> +#include "rte_cpuflags.h"
>> +
>> +#define RTE_NATIVE_WRITE32_WC
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * @internal
>> + * MOVDIRI wrapper.
>> + */
>> +static __rte_always_inline void
>> +_rte_x86_movdiri(uint32_t value, volatile void *addr) {
>> +	asm volatile(
>> +		/* MOVDIRI */
>> +		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
>> +		:
>> +		: "a" (value), "d" (addr));
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr) {
>> +	static int _x86_movdiri_flag = -1;
>> +	if (_x86_movdiri_flag == 1) {
>> +		rte_wmb();
>> +		_rte_x86_movdiri(value, addr);
>> +	} else if (_x86_movdiri_flag == 0) {
>> +		rte_write32(value, addr);
>> +	} else {
>> +		_x86_movdiri_flag =
>> +
>> 	(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
>> +		if (_x86_movdiri_flag == 1) {
>> +			rte_wmb();
>> +			_rte_x86_movdiri(value, addr);
>> +		} else {
>> +			rte_write32(value, addr);
>> +		}
>> +	}
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr) {
>> +	static int _x86_movdiri_flag = -1;
>> +	if (_x86_movdiri_flag == 1) {
>> +		_rte_x86_movdiri(value, addr);
>> +	} else if (_x86_movdiri_flag == 0) {
>> +		rte_write32_relaxed(value, addr);
>> +	} else {
>> +		_x86_movdiri_flag =
>> +
>> 	(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
>> +		if (_x86_movdiri_flag == 1)
>> +			_rte_x86_movdiri(value, addr);
>> +		else
>> +			rte_write32_relaxed(value, addr);
>> +	}
>> +}
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> --
>> 2.7.4

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-20  6:46     ` Ruifeng Wang
@ 2020-07-20  8:54       ` Nicolau, Radu
  0 siblings, 0 replies; 76+ messages in thread
From: Nicolau, Radu @ 2020-07-20  8:54 UTC (permalink / raw)
  To: Ruifeng Wang, dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1, nd


On 7/20/2020 7:46 AM, Ruifeng Wang wrote:
>> -----Original Message-----
>> From: Radu Nicolau <radu.nicolau@intel.com>
>> Sent: Friday, July 17, 2020 6:50 PM
>> To: dev@dpdk.org
>> Cc: beilei.xing@intel.com; jia.guo@intel.com; bruce.richardson@intel.com;
>> konstantin.ananyev@intel.com; jerinjacobk@gmail.com;
>> david.marchand@redhat.com; fiona.trahe@intel.com; wei.zhao1@intel.com;
>> Ruifeng Wang <Ruifeng.Wang@arm.com>; Radu Nicolau
>> <radu.nicolau@intel.com>
>> Subject: [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers
>>
>> Performance improvement: use a write combining store instead of a regular
>> mmio write to update queue tail registers.
>>
>> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
>> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
>> ---
>>   drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
>>   drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
>>   drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
>> drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
>>   4 files changed, 13 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/net/i40e/base/i40e_osdep.h
>> b/drivers/net/i40e/base/i40e_osdep.h
>> index 58be396..69ab717 100644
>> --- a/drivers/net/i40e/base/i40e_osdep.h
>> +++ b/drivers/net/i40e/base/i40e_osdep.h
>> @@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void
>> *addr)
>>   #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
>>   	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
>>
>> +#define I40E_PCI_REG_WC_WRITE(queue, reg, value) \
> 'queue' is not necessary since it will not be used. It can be removed?
Yes, I will remove it - in the first version we had a flag in the queue 
struct, and this macro was not properly updated.
>
> Thanks.
> /Ruifeng
>> +	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
>> +#define I40E_PCI_REG_WC_WRITE_RELAXED(queue, reg, value) \
>> +	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
>> +
>>   #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
>> #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
>>
>> diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
>> index 840b6f3..64e43ac 100644
>> --- a/drivers/net/i40e/i40e_rxtx.c
>> +++ b/drivers/net/i40e/i40e_rxtx.c
>> @@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf
>> **rx_pkts, uint16_t nb_pkts)
>>   	if (nb_hold > rxq->rx_free_thresh) {
>>   		rx_id = (uint16_t) ((rx_id == 0) ?
>>   			(rxq->nb_rx_desc - 1) : (rx_id - 1));
>> -		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
>> +		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>>   		nb_hold = 0;
>>   	}
>>   	rxq->nb_rx_hold = nb_hold;
>> @@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
>>   	if (nb_hold > rxq->rx_free_thresh) {
>>   		rx_id = (uint16_t)(rx_id == 0 ?
>>   			(rxq->nb_rx_desc - 1) : (rx_id - 1));
>> -		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
>> +		I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>>   		nb_hold = 0;
>>   	}
>>   	rxq->nb_rx_hold = nb_hold;
>> @@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf
>> **tx_pkts, uint16_t nb_pkts)
>>   		   (unsigned) tx_id, (unsigned) nb_tx);
>>
>>   	rte_cio_wmb();
>> -	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
>> +	I40E_PCI_REG_WC_WRITE_RELAXED(txq, txq->qtx_tail, tx_id);
>>   	txq->tx_tail = tx_id;
>>
>>   	return nb_tx;
>> @@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
>>   		txq->tx_tail = 0;
>>
>>   	/* Update the tx tail register */
>> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
>> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
>>
>>   	return nb_pkts;
>>   }
>> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
>> b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
>> index 3bcef13..294c1c4 100644
>> --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
>> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
>> @@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>>   			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
>>
>>   	/* Update the tail pointer on the NIC */
>> -	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
>> +	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>>   }
>>
>>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>> @@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue,
>> struct rte_mbuf **tx_pkts,
>>
>>   	txq->tx_tail = tx_id;
>>
>> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
>> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
>>
>>   	return nb_pkts;
>>   }
>> diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c
>> b/drivers/net/i40e/i40e_rxtx_vec_sse.c
>> index 6985183..a4635e0 100644
>> --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
>> +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
>> @@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>>   			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
>>
>>   	/* Update the tail pointer on the NIC */
>> -	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
>> +	I40E_PCI_REG_WC_WRITE(rxq, rxq->qrx_tail, rx_id);
>>   }
>>
>>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>> @@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct
>> rte_mbuf **tx_pkts,
>>
>>   	txq->tx_tail = tx_id;
>>
>> -	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
>> +	I40E_PCI_REG_WC_WRITE(txq, txq->qtx_tail, txq->tx_tail);
>>
>>   	return nb_pkts;
>>   }
>> --
>> 2.7.4

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (9 preceding siblings ...)
  2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-20  9:12 ` Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 1/4] " Radu Nicolau
                     ` (3 more replies)
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
                   ` (2 subsequent siblings)
  13 siblings, 4 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-20  9:12 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.

Radu Nicolau (4):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update queue tail registers

 .../qat/qat_adf/adf_transport_access_macros.h      |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h                 |  5 ++
 drivers/net/i40e/i40e_rxtx.c                       |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c              |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c               |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h               |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                     | 15 +++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c             |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h             | 12 +++++
 lib/librte_eal/include/generic/rte_io.h            | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h                | 56 ++++++++++++++++++++++
 11 files changed, 149 insertions(+), 19 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v9 1/4] eal: add WC store functions
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-20  9:12   ` Radu Nicolau
  2020-07-20 12:20     ` David Marchand
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-07-20  9:12 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
 lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e534624..d07d9cb 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v9 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 1/4] " Radu Nicolau
@ 2020-07-20  9:12   ` Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 3/4] common/qat: " Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-20  9:12 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..9b50330 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..f709c52 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..178d8f4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..240ce47 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v9 3/4] common/qat: use WC store to update queue tail registers
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 1/4] " Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-20  9:12   ` Radu Nicolau
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-20  9:12 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef551..504ffb7 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v9 4/4] net/ixgbe: use WC store to update queue tail registers
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 3/4] common/qat: " Radu Nicolau
@ 2020-07-20  9:12   ` Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-20  9:12 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 15 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7..cacf724 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 2e20e18..6701c07 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 					    cur_free_trigger);
 	}
 
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2096,8 +2096,9 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
-							    next_rdt);
+				IXGBE_PCI_REG_WC_WRITE_RELAXED(
+							rxq->rdt_reg_addr,
+							next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
 				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
@@ -2262,7 +2263,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca31..e77a7f3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/4] eal: add WC store functions
  2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 1/4] " Radu Nicolau
@ 2020-07-20 12:20     ` David Marchand
  2020-07-21  8:56       ` Nicolau, Radu
  0 siblings, 1 reply; 76+ messages in thread
From: David Marchand @ 2020-07-20 12:20 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China)

On Mon, Jul 20, 2020 at 11:12 AM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Add rte_write32_wc and rte_write32_wc_relaxed functions
> that implement 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.

What is the difference of using this new API when compared to the
existing pci driver flag RTE_PCI_DRV_WC_ACTIVATE?
Do we have some overlap between the two?

This commitlog is quite short for something that touches performance.
I saw a question from Ruifeng, it is worth adding this to the commitlog.

Which x86 platforms will benefit from it?
What is the impact on performance for existing platforms that have no
MOVDIRI support?


>
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
>  lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
>  3 files changed, 116 insertions(+)
>
> diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
> index e534624..d07d9cb 100644
> --- a/lib/librte_eal/arm/include/rte_io_64.h
> +++ b/lib/librte_eal/arm/include/rte_io_64.h
> @@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
>         rte_write64_relaxed(value, addr);
>  }
>
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       rte_write32_relaxed(value, addr);
> +}
> +

We were using a single knob, RTE_OVERRIDE_IO_H, for overriding the whole rte_io.h.
Now we would have a special case for one API on x86, while the code is
copy/pasted in the ARM header, which keeps the "whole" override mode.

This leaves an unfinished taste.

Why did you not flag all relevant "native" helpers?
This would factor some code from the ARM header.


>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..0669baa 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
>  static inline void
>  rte_write64(uint64_t value, volatile void *addr);
>
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +__rte_experimental
> +static inline void
> +rte_write32_wc(uint32_t value, volatile void *addr);
> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier, useful in
> + * accessing the device registers of integrated controllers which implicitly
> + * strongly ordered with respect to memory access.

It might be just me, but I have trouble reading the last part of this sentence.
Maybe remove "with respect to"?


> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +__rte_experimental
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
>  #endif /* __DOXYGEN__ */
>
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
>         rte_write64_relaxed(value, addr);
>  }
>
> +#ifndef RTE_NATIVE_WRITE32_WC
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
>  #endif /* RTE_OVERRIDE_IO_H */
>
>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..c95ed67 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,64 @@
>  extern "C" {
>  #endif
>
> +#include "rte_cpuflags.h"
> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
>
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
> +{
> +       asm volatile(
> +               /* MOVDIRI */
> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +               :
> +               : "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;
> +       if (_x86_movdiri_flag == 1) {
> +               rte_wmb();
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +               if (_x86_movdiri_flag == 1) {
> +                       rte_wmb();
> +                       _rte_x86_movdiri(value, addr);
> +               } else {
> +                       rte_write32(value, addr);
> +               }
> +       }
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;
> +       if (_x86_movdiri_flag == 1) {
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32_relaxed(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +               if (_x86_movdiri_flag == 1)
> +                       _rte_x86_movdiri(value, addr);
> +               else
> +                       rte_write32_relaxed(value, addr);
> +       }
> +}
> +

Repeating some comments I made earlier.

- If a single helper called by both rte_write32_wc and
rte_write32_wc_relaxed with a _constant_ flag is not to your liking (I
don't see where it would have an impact on performance), then maybe
rte_write32_wc() can be simply implemented as:

+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+    rte_wmb();
+    rte_write32_wc_relaxed(value, addr);
+}
+

- Looking at this above suggestion, I wonder about the non-relaxed case.
Is rte_io_wmb() not enough?


- The cpuflag check can be resolved at init once and for all.
By this, I mean in lib/librte_eal/x86/rte_cpuflags.c:

+int rte_x86_movdiri_flag = -1;
+
+RTE_INIT(rte_x86_movdiri_init)
+{
+       rte_x86_movdiri_flag =
+               (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+}

The variable can be exported in lib/librte_eal/x86/include/rte_cpuflags.h.

Then rte_write32_wc_relaxed() becomes:

+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+    if (rte_x86_movdiri_flag == 1) {
+        asm volatile(
+            /* MOVDIRI */
+            ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+            :
+            : "a" (value), "d" (addr));
+        return;
+    }
+
+    rte_write32_relaxed(value, addr);
+}
+


Thanks.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/4] eal: add WC store functions
  2020-07-20 12:20     ` David Marchand
@ 2020-07-21  8:56       ` Nicolau, Radu
  0 siblings, 0 replies; 76+ messages in thread
From: Nicolau, Radu @ 2020-07-21  8:56 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China)


On 7/20/2020 1:20 PM, David Marchand wrote:
> On Mon, Jul 20, 2020 at 11:12 AM Radu Nicolau <radu.nicolau@intel.com> wrote:
>> Add rte_write32_wc and rte_write32_wc_relaxed functions
>> that implement 32bit stores using write combining memory protocol.
>> Provided generic stubs and x86 implementation.
> What is the difference of using this new API when compared to the
> existing pci driver flag RTE_PCI_DRV_WC_ACTIVATE?
> Do we have some overlap between the two?
No, the RTE_PCI_DRV_WC_ACTIVATE define will map the BARs as write 
combining, whereas these functions will use the WC regardless of the 
mapping, and they can be used for memory areas that won't be mapped 
using the PCI infrastructure.
>
> This commitlog is quite short for something that touches performance.
> I saw a question from Ruifeng, it is worth adding this to the commitlog.

Strictly speaking, this patch only enables support for WC stores, and it 
does not have any performance implications on its own. I think a fair 
assumption is that if someone is looking to use WC stores they will know 
exactly how and where they can be used.

>
> Which x86 platforms will benefit from it?
> What is the impact on performance for existing platforms that have no
> MOVDIRI support?

With this particular patch, x86 platforms that support the MOVDIRI
instruction; any other platform that has a similar instruction would
also benefit if support for it were enabled.

There is no impact for the ones that don't support it.

>
>
>> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
>> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
>> ---
>>   lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
>>   lib/librte_eal/include/generic/rte_io.h | 48 ++++++++++++++++++++++++++++
>>   lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
>>   3 files changed, 116 insertions(+)
>>
>> diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
>> index e534624..d07d9cb 100644
>> --- a/lib/librte_eal/arm/include/rte_io_64.h
>> +++ b/lib/librte_eal/arm/include/rte_io_64.h
>> @@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
>>          rte_write64_relaxed(value, addr);
>>   }
>>
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32_relaxed(value, addr);
>> +}
>> +
> We were using a single knob RTE_OVERRIDE_IO_H for overriding the whole rte_io.h.
> Now we would have a special case for an API for x86 and the code is
> copy/pasted in the ARM header and keeping the "whole" override mode.
>
> This leaves an unfinished taste.
>
> Why did you not flag all relevant "native" helpers?
> This would factor some code from the ARM header.
I agree that a more granular approach is better; having a single knob
is why the ARM header had about half of the functions overridden and
half copied and pasted before this patch. But this is outside the scope
of this patch.
>
>
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
>> index da457f7..0669baa 100644
>> --- a/lib/librte_eal/include/generic/rte_io.h
>> +++ b/lib/librte_eal/include/generic/rte_io.h
>> @@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
>>   static inline void
>>   rte_write64(uint64_t value, volatile void *addr);
>>
>> +/**
>> + * Write a 32-bit value to I/O device memory address addr using write
>> + * combining memory write protocol. Depending on the platform write combining
>> + * may not be available and/or may be treated as a hint and the behavior may
>> + * fallback to a regular store.
>> + *
>> + * @param value
>> + *  Value to write
>> + * @param addr
>> + *  I/O memory address to write the value to
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr);
>> +
>> +/**
>> + * Write a 32-bit value to I/O device memory address addr using write
>> + * combining memory write protocol. Depending on the platform write combining
>> + * may not be available and/or may be treated as a hint and the behavior may
>> + * fallback to a regular store.
>> + *
>> + * The relaxed version does not have additional I/O memory barrier, useful in
>> + * accessing the device registers of integrated controllers which implicitly
>> + * strongly ordered with respect to memory access.
> It might be just me, but I have trouble reading the last part of this sentence.
> Maybe remove "with respect to"?
It was copied and pasted from the other _relaxed functions for 
consistency reasons.
>
>
>> + *
>> + * @param value
>> + *  Value to write
>> + * @param addr
>> + *  I/O memory address to write the value to
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
>> +
>>   #endif /* __DOXYGEN__ */
>>
>>   #ifndef RTE_OVERRIDE_IO_H
>> @@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
>>          rte_write64_relaxed(value, addr);
>>   }
>>
>> +#ifndef RTE_NATIVE_WRITE32_WC
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32_relaxed(value, addr);
>> +}
>> +#endif /* RTE_NATIVE_WRITE32_WC */
>> +
>>   #endif /* RTE_OVERRIDE_IO_H */
>>
>>   #endif /* _RTE_IO_H_ */
>> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..c95ed67 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -9,8 +9,64 @@
>>   extern "C" {
>>   #endif
>>
>> +#include "rte_cpuflags.h"
>> +
>> +#define RTE_NATIVE_WRITE32_WC
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * @internal
>> + * MOVDIRI wrapper.
>> + */
>> +static __rte_always_inline void
>> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
>> +{
>> +       asm volatile(
>> +               /* MOVDIRI */
>> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
>> +               :
>> +               : "a" (value), "d" (addr));
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
>> +       if (_x86_movdiri_flag == 1) {
>> +               rte_wmb();
>> +               _rte_x86_movdiri(value, addr);
>> +       } else if (_x86_movdiri_flag == 0) {
>> +               rte_write32(value, addr);
>> +       } else {
>> +               _x86_movdiri_flag =
>> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
>> +               if (_x86_movdiri_flag == 1) {
>> +                       rte_wmb();
>> +                       _rte_x86_movdiri(value, addr);
>> +               } else {
>> +                       rte_write32(value, addr);
>> +               }
>> +       }
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
>> +       if (_x86_movdiri_flag == 1) {
>> +               _rte_x86_movdiri(value, addr);
>> +       } else if (_x86_movdiri_flag == 0) {
>> +               rte_write32_relaxed(value, addr);
>> +       } else {
>> +               _x86_movdiri_flag =
>> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
>> +               if (_x86_movdiri_flag == 1)
>> +                       _rte_x86_movdiri(value, addr);
>> +               else
>> +                       rte_write32_relaxed(value, addr);
>> +       }
>> +}
>> +
> Repeating some comments I made earlier.
>
> - If a single helper called by both rte_write32_wc and
> rte_write32_wc_relaxed with a _constant_ flag is not to your liking (I
> don't see where it would have an impact on performance), then maybe
> rte_write32_wc() can be simply implemented as:
>
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +    rte_wmb();
> +    rte_write32_wc_relaxed(value, addr);
> +}
> +
Yes, it can be written as such, I will update the patch.
>
> - Looking at this above suggestion, I wonder about the non-relaxed case.
> Is rte_io_wmb() not enough?
No, we need an actual memory fence; on x86 rte_io_wmb() is defined as
rte_compiler_barrier().
>
>
> - The cpuflag check can be resolved at init once and for all.
> By this, I mean in lib/librte_eal/x86/rte_cpuflags.c:
>
> +int rte_x86_movdiri_flag = -1;
> +
> +RTE_INIT(rte_x86_movdiri_init)
> +{
> +       rte_x86_movdiri_flag =
> +               (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +}
>
> The variable can be exported in lib/librte_eal/x86/include/rte_cpuflags.h.
>
> Then rte_write32_wc_relaxed() becomes:
>
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +    if (rte_x86_movdiri_flag == 1) {
> +        asm volatile(
> +            /* MOVDIRI */
> +            ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +            :
> +            : "a" (value), "d" (addr));
> +        return;
> +    }
> +
> +    rte_write32_relaxed(value, addr);
> +}
> +
I tried this before sending the patch, but there is an issue when linking
the shared library version with the --no-undefined flag. This would
require exporting the variable in the overall map file, which means
a change across all platforms for something that is very x86 specific.
>
> Thanks.
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (10 preceding siblings ...)
  2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-21 11:31 ` Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 1/4] " Radu Nicolau
                     ` (3 more replies)
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
  13 siblings, 4 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-21 11:31 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.

Radu Nicolau (4):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update queue tail registers

 .../qat/qat_adf/adf_transport_access_macros.h      |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h                 |  5 +++
 drivers/net/i40e/i40e_rxtx.c                       |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c              |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c               |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h               |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                     | 15 +++----
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c             |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h             | 12 ++++++
 lib/librte_eal/include/generic/rte_io.h            | 48 ++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h                | 42 +++++++++++++++++++
 11 files changed, 135 insertions(+), 19 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v10 1/4] eal: add WC store functions
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
@ 2020-07-21 11:31   ` Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-21 11:31 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++++
 lib/librte_eal/include/generic/rte_io.h | 48 +++++++++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 42 +++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e534624..d07d9cb 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..0669baa 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..4f4ff8b 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,50 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_wmb();
+	rte_write32_wc_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v10 2/4] net/i40e: use WC store to update queue tail registers
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 1/4] " Radu Nicolau
@ 2020-07-21 11:31   ` Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 3/4] common/qat: " Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-21 11:31 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be396..9b50330 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 840b6f3..f709c52 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef13..178d8f4 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 6985183..240ce47 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v10 3/4] common/qat: use WC store to update queue tail registers
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 1/4] " Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-07-21 11:31   ` Radu Nicolau
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 4/4] net/ixgbe: " Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-21 11:31 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef551..504ffb7 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v10 4/4] net/ixgbe: use WC store to update queue tail registers
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 3/4] common/qat: " Radu Nicolau
@ 2020-07-21 11:31   ` Radu Nicolau
  3 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-07-21 11:31 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 15 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7..cacf724 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 2e20e18..6701c07 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 					    cur_free_trigger);
 	}
 
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2096,8 +2096,9 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
-							    next_rdt);
+				IXGBE_PCI_REG_WC_WRITE_RELAXED(
+							rxq->rdt_reg_addr,
+							next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
 				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
@@ -2262,7 +2263,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca31..e77a7f3 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.7.4


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (11 preceding siblings ...)
  2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
@ 2020-08-26  9:55 ` Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 1/5] " Radu Nicolau
                     ` (4 more replies)
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
  13 siblings, 5 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86-specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.

Radu Nicolau (5):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update queue tail registers
  net/ice: use WC store to update queue tail registers

 .../qat/qat_adf/adf_transport_access_macros.h |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h            |  5 ++
 drivers/net/i40e/i40e_rxtx.c                  |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c         |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c          |  4 +-
 drivers/net/ice/base/ice_osdep.h              |  1 +
 drivers/net/ice/ice_rxtx.c                    |  6 +--
 drivers/net/ice/ice_rxtx_vec_avx2.c           |  4 +-
 drivers/net/ice/ice_rxtx_vec_sse.c            |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h          |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                | 15 +++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c        |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h        | 12 +++++
 lib/librte_eal/include/generic/rte_io.h       | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h           | 42 ++++++++++++++++
 15 files changed, 143 insertions(+), 26 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 1/5] eal: add WC store functions
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
@ 2020-08-26  9:55   ` Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
 lib/librte_eal/include/generic/rte_io.h | 48 +++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 42 ++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e5346240e..d07d9cb22 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7f7..0669baa0b 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fall back to a regular store.
+ *
+ * The relaxed version does not add an I/O memory barrier; it is useful when
+ * accessing the device registers of integrated controllers, which are
+ * implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1b0..4f4ff8b87 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,50 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_wmb();
+	rte_write32_wc_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 1/5] " Radu Nicolau
@ 2020-08-26  9:55   ` Radu Nicolau
  2020-09-23  1:19     ` Lu, Wenzhuo
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 3/5] common/qat: " Radu Nicolau
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 drivers/net/i40e/base/i40e_osdep.h    | 5 +++++
 drivers/net/i40e/i40e_rxtx.c          | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be39677..9b5033024 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index fe7f9200c..76d634091 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3bcef1363..178d8f4e2 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..240ce478a 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 3/5] common/qat: use WC store to update queue tail registers
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 1/5] " Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-08-26  9:55   ` Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 4/5] net/ixgbe: " Radu Nicolau
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 5/5] net/ice: " Radu Nicolau
  4 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Fiona Trahe <fiona.trahe@intel.com>
---
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef5513f..504ffb723 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 4/5] net/ixgbe: use WC store to update queue tail registers
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 3/5] common/qat: " Radu Nicolau
@ 2020-08-26  9:55   ` Radu Nicolau
  2020-09-23  1:20     ` Lu, Wenzhuo
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 5/5] net/ice: " Radu Nicolau
  4 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 15 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7c0..cacf72419 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 977ecf513..29d385c06 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 					    cur_free_trigger);
 	}
 
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2096,8 +2096,9 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
-							    next_rdt);
+				IXGBE_PCI_REG_WC_WRITE_RELAXED(
+							rxq->rdt_reg_addr,
+							next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
 				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
@@ -2262,7 +2263,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..e77a7f31c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v11 5/5] net/ice: use WC store to update queue tail registers
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
                     ` (3 preceding siblings ...)
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 4/5] net/ixgbe: " Radu Nicolau
@ 2020-08-26  9:55   ` Radu Nicolau
  2020-09-23  1:20     ` Lu, Wenzhuo
  4 siblings, 1 reply; 76+ messages in thread
From: Radu Nicolau @ 2020-08-26  9:55 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
---
 drivers/net/ice/base/ice_osdep.h    | 1 +
 drivers/net/ice/ice_rxtx.c          | 6 +++---
 drivers/net/ice/ice_rxtx_vec_avx2.c | 4 ++--
 drivers/net/ice/ice_rxtx_vec_sse.c  | 4 ++--
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ice/base/ice_osdep.h b/drivers/net/ice/base/ice_osdep.h
index 360e435b8..c793f1e38 100644
--- a/drivers/net/ice/base/ice_osdep.h
+++ b/drivers/net/ice/base/ice_osdep.h
@@ -165,6 +165,7 @@ do {									\
 #endif
 
 #define ICE_PCI_REG_WRITE(reg, value) writel(value, reg)
+#define ICE_PCI_REG_WC_WRITE(reg, value) rte_write32_wc(value, reg)
 
 #define ICE_READ_REG(hw, reg)         rd32(hw, reg)
 #define ICE_WRITE_REG(hw, reg, value) wr32(hw, reg, value)
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 2e1f06d2c..d23841623 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -1741,7 +1741,7 @@ ice_recv_scattered_pkts(void *rx_queue,
 		rx_id = (uint16_t)(rx_id == 0 ?
 				   (rxq->nb_rx_desc - 1) : (rx_id - 1));
 		/* write TAIL register */
-		ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2131,7 +2131,7 @@ ice_recv_pkts(void *rx_queue,
 		rx_id = (uint16_t)(rx_id == 0 ?
 				   (rxq->nb_rx_desc - 1) : (rx_id - 1));
 		/* write TAIL register */
-		ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2846,7 +2846,7 @@ tx_xmit_pkts(struct ice_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..d9d475863 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -129,7 +129,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 static inline uint16_t
@@ -810,7 +810,7 @@ ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..3de162af0 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -78,7 +78,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 static inline void
@@ -583,7 +583,7 @@ ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-09-23  1:19     ` Lu, Wenzhuo
  0 siblings, 0 replies; 76+ messages in thread
From: Lu, Wenzhuo @ 2020-09-23  1:19 UTC (permalink / raw)
  To: Nicolau, Radu, dev
  Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce, Ananyev, Konstantin,
	jerinjacobk, david.marchand, Trahe, Fiona, Zhao1, Wei,
	ruifeng.wang, Yang, Qiming, Zhang, Qi Z, Nicolau, Radu

Hi,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Radu Nicolau
> Sent: Wednesday, August 26, 2020 5:56 PM
> To: dev@dpdk.org
> Cc: Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; jerinjacobk@gmail.com;
> david.marchand@redhat.com; Trahe, Fiona <fiona.trahe@intel.com>; Zhao1,
> Wei <wei.zhao1@intel.com>; ruifeng.wang@arm.com; Yang, Qiming
> <qiming.yang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Nicolau, Radu
> <radu.nicolau@intel.com>
> Subject: [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail
> registers
> 
> Performance improvement: use a write combining store instead of a regular
> mmio write to update queue tail registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v11 4/5] net/ixgbe: use WC store to update queue tail registers
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 4/5] net/ixgbe: " Radu Nicolau
@ 2020-09-23  1:20     ` Lu, Wenzhuo
  0 siblings, 0 replies; 76+ messages in thread
From: Lu, Wenzhuo @ 2020-09-23  1:20 UTC (permalink / raw)
  To: Nicolau, Radu, dev
  Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce, Ananyev, Konstantin,
	jerinjacobk, david.marchand, Trahe, Fiona, Zhao1, Wei,
	ruifeng.wang, Yang, Qiming, Zhang, Qi Z, Nicolau, Radu

Hi,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Radu Nicolau
> Sent: Wednesday, August 26, 2020 5:56 PM
> To: dev@dpdk.org
> Cc: Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; jerinjacobk@gmail.com;
> david.marchand@redhat.com; Trahe, Fiona <fiona.trahe@intel.com>; Zhao1,
> Wei <wei.zhao1@intel.com>; ruifeng.wang@arm.com; Yang, Qiming
> <qiming.yang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Nicolau, Radu
> <radu.nicolau@intel.com>
> Subject: [dpdk-dev] [PATCH v11 4/5] net/ixgbe: use WC store to update queue
> tail registers
> 
> Performance improvement: use a write combining store instead of a regular
> mmio write to update queue tail registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v11 5/5] net/ice: use WC store to update queue tail registers
  2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 5/5] net/ice: " Radu Nicolau
@ 2020-09-23  1:20     ` Lu, Wenzhuo
  0 siblings, 0 replies; 76+ messages in thread
From: Lu, Wenzhuo @ 2020-09-23  1:20 UTC (permalink / raw)
  To: Nicolau, Radu, dev
  Cc: Xing, Beilei, Guo, Jia, Richardson, Bruce, Ananyev, Konstantin,
	jerinjacobk, david.marchand, Trahe, Fiona, Zhao1, Wei,
	ruifeng.wang, Yang, Qiming, Zhang, Qi Z, Nicolau, Radu

Hi,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Radu Nicolau
> Sent: Wednesday, August 26, 2020 5:56 PM
> To: dev@dpdk.org
> Cc: Xing, Beilei <beilei.xing@intel.com>; Guo, Jia <jia.guo@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; jerinjacobk@gmail.com;
> david.marchand@redhat.com; Trahe, Fiona <fiona.trahe@intel.com>; Zhao1,
> Wei <wei.zhao1@intel.com>; ruifeng.wang@arm.com; Yang, Qiming
> <qiming.yang@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Nicolau, Radu
> <radu.nicolau@intel.com>
> Subject: [dpdk-dev] [PATCH v11 5/5] net/ice: use WC store to update queue tail
> registers
> 
> Performance improvement: use a write combining store instead of a regular
> mmio write to update queue tail registers.
> 
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions
  2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
                   ` (12 preceding siblings ...)
  2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
@ 2020-09-23 14:22 ` Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 1/5] " Radu Nicolau
                     ` (6 more replies)
  13 siblings, 7 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Implement 2 new functions that will enable write combining
stores depending on architecture. The functions are provided
as a generic stub and an x86 specific implementation.

The reason to implement these functions is to improve performance
by reducing the overhead associated with regular mmio writes when
updating the hardware queue tails and doorbells.

With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
use the write combining store functions with other PMDs to follow.


Radu Nicolau (5):
  eal: add WC store functions
  net/i40e: use WC store to update queue tail registers
  common/qat: use WC store to update queue tail registers
  net/ixgbe: use WC store to update queue tail registers
  net/ice: use WC store to update queue tail registers

 doc/guides/rel_notes/release_20_11.rst        | 22 +++++++++
 .../qat/qat_adf/adf_transport_access_macros.h |  6 ++-
 drivers/net/i40e/base/i40e_osdep.h            |  5 ++
 drivers/net/i40e/i40e_rxtx.c                  |  8 ++--
 drivers/net/i40e/i40e_rxtx_vec_avx2.c         |  4 +-
 drivers/net/i40e/i40e_rxtx_vec_sse.c          |  4 +-
 drivers/net/ice/base/ice_osdep.h              |  1 +
 drivers/net/ice/ice_rxtx.c                    |  6 +--
 drivers/net/ice/ice_rxtx_vec_avx2.c           |  4 +-
 drivers/net/ice/ice_rxtx_vec_sse.c            |  4 +-
 drivers/net/ixgbe/base/ixgbe_osdep.h          |  6 +++
 drivers/net/ixgbe/ixgbe_rxtx.c                | 15 +++---
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c        |  4 +-
 lib/librte_eal/arm/include/rte_io_64.h        | 12 +++++
 lib/librte_eal/include/generic/rte_io.h       | 48 +++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h           | 42 ++++++++++++++++
 16 files changed, 165 insertions(+), 26 deletions(-)

-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 1/5] eal: add WC store functions
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
@ 2020-09-23 14:22   ` Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst  |  6 ++++
 lib/librte_eal/arm/include/rte_io_64.h  | 12 +++++++
 lib/librte_eal/include/generic/rte_io.h | 48 +++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 42 ++++++++++++++++++++++
 4 files changed, 108 insertions(+)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index c6642f5f9..f51577684 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -78,6 +78,12 @@ New Features
     ``--portmask=N``
     where N represents the hexadecimal bitmask of ports used.
 
+* **Added write combining store APIs.**
+
+  Added ``rte_write32_wc`` and ``rte_write32_wc_relaxed`` APIs
+  that enable write combining stores (depending on architecture).
+  The functions are provided as generic stubs and
+  an x86 specific implementation.
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/arm/include/rte_io_64.h b/lib/librte_eal/arm/include/rte_io_64.h
index e5346240e..d07d9cb22 100644
--- a/lib/librte_eal/arm/include/rte_io_64.h
+++ b/lib/librte_eal/arm/include/rte_io_64.h
@@ -164,6 +164,18 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7f7..0669baa0b 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,40 @@ rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have an additional I/O memory barrier, which
+ * is useful when accessing the device registers of integrated controllers that
+ * are implicitly strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+__rte_experimental
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +379,20 @@ rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1b0..4f4ff8b87 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,50 @@
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_wmb();
+	rte_write32_wc_relaxed(value, addr);
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 2/5] net/i40e: use WC store to update queue tail registers
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 1/5] " Radu Nicolau
@ 2020-09-23 14:22   ` Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 3/5] common/qat: " Radu Nicolau
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst | 4 ++++
 drivers/net/i40e/base/i40e_osdep.h     | 5 +++++
 drivers/net/i40e/i40e_rxtx.c           | 8 ++++----
 drivers/net/i40e/i40e_rxtx_vec_avx2.c  | 4 ++--
 drivers/net/i40e/i40e_rxtx_vec_sse.c   | 4 ++--
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index f51577684..c0307893e 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -85,6 +85,10 @@ New Features
   The functions are provided as a generic stubs and
   x86 specific implementation.
 
+* **Updated Intel i40e driver.**
+
+  Updated the Intel i40e driver to use write combining stores.
+
 Removed Items
 -------------
 
diff --git a/drivers/net/i40e/base/i40e_osdep.h b/drivers/net/i40e/base/i40e_osdep.h
index 58be39677..9b5033024 100644
--- a/drivers/net/i40e/base/i40e_osdep.h
+++ b/drivers/net/i40e/base/i40e_osdep.h
@@ -138,6 +138,11 @@ static inline uint32_t i40e_read_addr(volatile void *addr)
 #define I40E_PCI_REG_WRITE_RELAXED(reg, value)	\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define I40E_PCI_REG_WC_WRITE(reg, value) \
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+#define I40E_PCI_REG_WC_WRITE_RELAXED(reg, value) \
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define I40E_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_GLGEN_STAT)
 #define I40EVF_WRITE_FLUSH(a) I40E_READ_REG(a, I40E_VFGEN_RSTAT)
 
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 60b33d20a..7be9c8c95 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -760,7 +760,7 @@ i40e_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t) ((rx_id == 0) ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -938,7 +938,7 @@ i40e_recv_scattered_pkts(void *rx_queue,
 	if (nb_hold > rxq->rx_free_thresh) {
 		rx_id = (uint16_t)(rx_id == 0 ?
 			(rxq->nb_rx_desc - 1) : (rx_id - 1));
-		I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -1249,7 +1249,7 @@ i40e_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		   (unsigned) tx_id, (unsigned) nb_tx);
 
 	rte_cio_wmb();
-	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);
+	I40E_PCI_REG_WC_WRITE_RELAXED(txq->qtx_tail, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1400,7 +1400,7 @@ tx_xmit_pkts(struct i40e_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 37e7db5d7..7a558fc73 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -134,7 +134,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -921,7 +921,7 @@ i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 698518349..240ce478a 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -86,7 +86,7 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
@@ -733,7 +733,7 @@ i40e_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	I40E_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 3/5] common/qat: use WC store to update queue tail registers
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 1/5] " Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
@ 2020-09-23 14:22   ` Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 4/5] net/ixgbe: " Radu Nicolau
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Fiona Trahe <fiona.trahe@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst                   | 4 ++++
 drivers/common/qat/qat_adf/adf_transport_access_macros.h | 6 ++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index c0307893e..3d310572c 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -89,6 +89,10 @@ New Features
 
   Updated the Intel i40e driver to use write combining stores.
 
+* **Updated Intel qat driver.**
+
+  Updated the Intel qat driver to use write combining stores.
+
 Removed Items
 -------------
 
diff --git a/drivers/common/qat/qat_adf/adf_transport_access_macros.h b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
index 1eef5513f..504ffb723 100644
--- a/drivers/common/qat/qat_adf/adf_transport_access_macros.h
+++ b/drivers/common/qat/qat_adf/adf_transport_access_macros.h
@@ -9,6 +9,8 @@
 /* CSR write macro */
 #define ADF_CSR_WR(csrAddr, csrOffset, val)		\
 	rte_write32(val, (((uint8_t *)csrAddr) + csrOffset))
+#define ADF_CSR_WC_WR(csrAddr, csrOffset, val)		\
+	rte_write32_wc(val, (((uint8_t *)csrAddr) + csrOffset))
 
 /* CSR read macro */
 #define ADF_CSR_RD(csrAddr, csrOffset)			\
@@ -110,10 +112,10 @@ do { \
 		ADF_RING_CSR_RING_UBASE + (ring << 2), u_base);	\
 } while (0)
 #define WRITE_CSR_RING_HEAD(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_HEAD + (ring << 2), value)
 #define WRITE_CSR_RING_TAIL(csr_base_addr, bank, ring, value) \
-	ADF_CSR_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
+	ADF_CSR_WC_WR(csr_base_addr, (ADF_RING_BUNDLE_SIZE * bank) + \
 		ADF_RING_CSR_RING_TAIL + (ring << 2), value)
 #define WRITE_CSR_INT_SRCSEL(csr_base_addr, bank) \
 do { \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 4/5] net/ixgbe: use WC store to update queue tail registers
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
                     ` (2 preceding siblings ...)
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 3/5] common/qat: " Radu Nicolau
@ 2020-09-23 14:22   ` Radu Nicolau
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 5/5] net/ice: " Radu Nicolau
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst |  4 ++++
 drivers/net/ixgbe/base/ixgbe_osdep.h   |  6 ++++++
 drivers/net/ixgbe/ixgbe_rxtx.c         | 15 ++++++++-------
 drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c |  4 ++--
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index 3d310572c..caca04208 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -93,6 +93,10 @@ New Features
 
   Updated the Intel qat driver to use write combining stores.
 
+* **Updated Intel ixgbe driver.**
+
+  Updated the Intel ixgbe driver to use write combining stores.
+
 Removed Items
 -------------
 
diff --git a/drivers/net/ixgbe/base/ixgbe_osdep.h b/drivers/net/ixgbe/base/ixgbe_osdep.h
index dc712b7c0..cacf72419 100644
--- a/drivers/net/ixgbe/base/ixgbe_osdep.h
+++ b/drivers/net/ixgbe/base/ixgbe_osdep.h
@@ -105,6 +105,12 @@ static inline uint32_t ixgbe_read_addr(volatile void* addr)
 #define IXGBE_PCI_REG_WRITE_RELAXED(reg, value)		\
 	rte_write32_relaxed((rte_cpu_to_le_32(value)), reg)
 
+#define IXGBE_PCI_REG_WC_WRITE(reg, value)			\
+	rte_write32_wc((rte_cpu_to_le_32(value)), reg)
+
+#define IXGBE_PCI_REG_WC_WRITE_RELAXED(reg, value)		\
+	rte_write32_wc_relaxed((rte_cpu_to_le_32(value)), reg)
+
 #define IXGBE_PCI_REG_ADDR(hw, reg) \
 	((volatile uint32_t *)((char *)(hw)->hw_addr + (reg)))
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx.c b/drivers/net/ixgbe/ixgbe_rxtx.c
index 977ecf513..29d385c06 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx.c
@@ -308,7 +308,7 @@ tx_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* update tail pointer */
 	rte_wmb();
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
@@ -946,7 +946,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
 	PMD_TX_LOG(DEBUG, "port_id=%u queue_id=%u tx_tail=%u nb_tx=%u",
 		   (unsigned) txq->port_id, (unsigned) txq->queue_id,
 		   (unsigned) tx_id, (unsigned) nb_tx);
-	IXGBE_PCI_REG_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
+	IXGBE_PCI_REG_WC_WRITE_RELAXED(txq->tdt_reg_addr, tx_id);
 	txq->tx_tail = tx_id;
 
 	return nb_tx;
@@ -1692,7 +1692,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		/* update tail pointer */
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr,
 					    cur_free_trigger);
 	}
 
@@ -1918,7 +1918,7 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 			   (unsigned) nb_rx);
 		rx_id = (uint16_t) ((rx_id == 0) ?
 				     (rxq->nb_rx_desc - 1) : (rx_id - 1));
-		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+		IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2096,8 +2096,9 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 
 			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
 				rte_wmb();
-				IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr,
-							    next_rdt);
+				IXGBE_PCI_REG_WC_WRITE_RELAXED(
+							rxq->rdt_reg_addr,
+							next_rdt);
 				nb_hold -= rxq->rx_free_thresh;
 			} else {
 				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
@@ -2262,7 +2263,7 @@ ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts,
 			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
 
 		rte_wmb();
-		IXGBE_PCI_REG_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
+		IXGBE_PCI_REG_WC_WRITE_RELAXED(rxq->rdt_reg_addr, prev_id);
 		nb_hold = 0;
 	}
 
diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
index 517ca3166..e77a7f31c 100644
--- a/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
+++ b/drivers/net/ixgbe/ixgbe_rxtx_vec_sse.c
@@ -90,7 +90,7 @@ ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
+	IXGBE_PCI_REG_WC_WRITE(rxq->rdt_reg_addr, rx_id);
 }
 
 #ifdef RTE_LIBRTE_SECURITY
@@ -697,7 +697,7 @@ ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	IXGBE_PCI_REG_WRITE(txq->tdt_reg_addr, txq->tx_tail);
+	IXGBE_PCI_REG_WC_WRITE(txq->tdt_reg_addr, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [dpdk-dev] [PATCH v12 5/5] net/ice: use WC store to update queue tail registers
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
                     ` (3 preceding siblings ...)
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 4/5] net/ixgbe: " Radu Nicolau
@ 2020-09-23 14:22   ` Radu Nicolau
  2020-10-08  7:28   ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions David Marchand
  2020-10-13 12:50   ` David Marchand
  6 siblings, 0 replies; 76+ messages in thread
From: Radu Nicolau @ 2020-09-23 14:22 UTC (permalink / raw)
  To: dev
  Cc: beilei.xing, jia.guo, bruce.richardson, konstantin.ananyev,
	jerinjacobk, david.marchand, fiona.trahe, wei.zhao1,
	ruifeng.wang, qiming.yang, qi.z.zhang, Radu Nicolau

Performance improvement: use a write combining store
instead of a regular mmio write to update queue tail
registers.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Reviewed-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/rel_notes/release_20_11.rst | 4 ++++
 drivers/net/ice/base/ice_osdep.h       | 1 +
 drivers/net/ice/ice_rxtx.c             | 6 +++---
 drivers/net/ice/ice_rxtx_vec_avx2.c    | 4 ++--
 drivers/net/ice/ice_rxtx_vec_sse.c     | 4 ++--
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index caca04208..9d6e07474 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -97,6 +97,10 @@ New Features
 
   Updated the Intel ixgbe driver to use write combining stores.
 
+* **Updated Intel ice driver.**
+
+  Updated the Intel ice driver to use write combining stores.
+
 Removed Items
 -------------
 
diff --git a/drivers/net/ice/base/ice_osdep.h b/drivers/net/ice/base/ice_osdep.h
index 9a170b514..c0f1e7725 100644
--- a/drivers/net/ice/base/ice_osdep.h
+++ b/drivers/net/ice/base/ice_osdep.h
@@ -165,6 +165,7 @@ do {									\
 #endif
 
 #define ICE_PCI_REG_WRITE(reg, value) writel(value, reg)
+#define ICE_PCI_REG_WC_WRITE(reg, value) rte_write32_wc(value, reg)
 
 #define ICE_READ_REG(hw, reg)         rd32(hw, reg)
 #define ICE_WRITE_REG(hw, reg, value) wr32(hw, reg, value)
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index fef6ad454..6bd5b4a0c 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -1788,7 +1788,7 @@ ice_recv_scattered_pkts(void *rx_queue,
 		rx_id = (uint16_t)(rx_id == 0 ?
 				   (rxq->nb_rx_desc - 1) : (rx_id - 1));
 		/* write TAIL register */
-		ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2178,7 +2178,7 @@ ice_recv_pkts(void *rx_queue,
 		rx_id = (uint16_t)(rx_id == 0 ?
 				   (rxq->nb_rx_desc - 1) : (rx_id - 1));
 		/* write TAIL register */
-		ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+		ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 		nb_hold = 0;
 	}
 	rxq->nb_rx_hold = nb_hold;
@@ -2893,7 +2893,7 @@ tx_xmit_pkts(struct ice_tx_queue *txq,
 		txq->tx_tail = 0;
 
 	/* Update the tx tail register */
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index 5969a3048..b72a9e702 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -129,7 +129,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 static inline __m256i
@@ -962,7 +962,7 @@ ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index c4c9a9126..1afd96ac9 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -97,7 +97,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
 			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 
 	/* Update the tail pointer on the NIC */
-	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+	ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
 static inline void
@@ -689,7 +689,7 @@ ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	txq->tx_tail = tx_id;
 
-	ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+	ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
 	return nb_pkts;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
                     ` (4 preceding siblings ...)
  2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 5/5] net/ice: " Radu Nicolau
@ 2020-10-08  7:28   ` David Marchand
  2020-10-08  9:51     ` Nicolau, Radu
  2020-10-13  8:57     ` Ferruh Yigit
  2020-10-13 12:50   ` David Marchand
  6 siblings, 2 replies; 76+ messages in thread
From: David Marchand @ 2020-10-08  7:28 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China),
	Qiming Yang, Qi Zhang, Yigit, Ferruh, Akhil Goyal,
	David Christensen

On Wed, Sep 23, 2020 at 4:23 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Implement 2 new functions that will enable write combining
> stores depending on architecture. The functions are provided
> as a generic stub and a x86 specific implementation.
>
> The reason to implement these functions is to improve performance
> by reducing the overhead associated with regular mmio writes when
> updating the hardware queue tails and doorbells.

For the record, on which CPU/platform was this tested and how much of
an improvement did you get with this?

I did not see review/ack tokens from other arch maintainers, but since
it has been on the ml for a while, I guess I can proceed as is.


>
> With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
> use the write combining store functions with other PMDs to follow.

This series will go through the main repo: copying Ferruh and Akhil for info.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions
  2020-10-08  7:28   ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions David Marchand
@ 2020-10-08  9:51     ` Nicolau, Radu
  2020-10-13  8:57     ` Ferruh Yigit
  1 sibling, 0 replies; 76+ messages in thread
From: Nicolau, Radu @ 2020-10-08  9:51 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China),
	Qiming Yang, Qi Zhang, Yigit, Ferruh, Akhil Goyal,
	David Christensen


On 10/8/2020 8:28 AM, David Marchand wrote:
> On Wed, Sep 23, 2020 at 4:23 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>> Implement 2 new functions that will enable write combining
>> stores depending on architecture. The functions are provided
>> as a generic stub and a x86 specific implementation.
>>
>> The reason to implement these functions is to improve performance
>> by reducing the overhead associated with regular mmio writes when
>> updating the hardware queue tails and doorbells.
> For the record, on which CPU/platform was this tested and how much of
> an improvement did you get with this?

The improvement varies a lot with the particular use case and the PMD, so
it's difficult to state a number, but there were cases with performance
improvements going well into the double digits, with applications using
very small bursts seeing the most benefit. Tests were done on a Snow Ridge
platform.


>
> I did not see review/ack tokens from other arch maintainers, but since
> it has been on the ml for a while, I guess I can proceed as is.
>
>
>> With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
>> use the write combining store functions with other PMDs to follow.
> This series will go through the main repo: copying Ferruh and Akhil for info.
>
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions
  2020-10-08  7:28   ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions David Marchand
  2020-10-08  9:51     ` Nicolau, Radu
@ 2020-10-13  8:57     ` Ferruh Yigit
  1 sibling, 0 replies; 76+ messages in thread
From: Ferruh Yigit @ 2020-10-13  8:57 UTC (permalink / raw)
  To: David Marchand, Radu Nicolau
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China),
	Qiming Yang, Qi Zhang, Akhil Goyal, David Christensen

On 10/8/2020 8:28 AM, David Marchand wrote:
> On Wed, Sep 23, 2020 at 4:23 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>>
>> Implement 2 new functions that will enable write combining
>> stores depending on architecture. The functions are provided
>> as a generic stub and a x86 specific implementation.
>>
>> The reason to implement these functions is to improve performance
>> by reducing the overhead associated with regular mmio writes when
>> updating the hardware queue tails and doorbells.
> 
> For the record, on which CPU/platform was this tested and how much of
> an improvement did you get with this?
> 
> I did not see review/ack tokens from other arch maintainers, but since
> it has been on the ml for a while, I guess I can proceed as is.
> 
> 
>>
>> With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
>> use the write combining store functions with other PMDs to follow.
> 
> This series will go through the main repo: copying Ferruh and Akhil for info.
> 

Sounds good to me, +1 to not separating the driver implementation from the actual change.


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions
  2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
                     ` (5 preceding siblings ...)
  2020-10-08  7:28   ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions David Marchand
@ 2020-10-13 12:50   ` David Marchand
  6 siblings, 0 replies; 76+ messages in thread
From: David Marchand @ 2020-10-13 12:50 UTC (permalink / raw)
  To: Radu Nicolau
  Cc: dev, Beilei Xing, Jeff Guo, Bruce Richardson, Ananyev,
	Konstantin, Jerin Jacob, Trahe, Fiona, Wei Zhao,
	Ruifeng Wang (Arm Technology China),
	Qiming Yang, Qi Zhang, Yigit, Ferruh, Akhil Goyal

On Wed, Sep 23, 2020 at 4:23 PM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Implement 2 new functions that will enable write combining
> stores depending on architecture. The functions are provided
> as a generic stub and a x86 specific implementation.
>
> The reason to implement these functions is to improve performance
> by reducing the overhead associated with regular mmio writes when
> updating the hardware queue tails and doorbells.
>
> With this patch set the I40E, ICE, IXGBE and QAT PMDs are updated to
> use the write combining store functions with other PMDs to follow.
>
>
> Radu Nicolau (5):
>   eal: add WC store functions
>   net/i40e: use WC store to update queue tail registers
>   common/qat: use WC store to update queue tail registers
>   net/ixgbe: use WC store to update queue tail registers
>   net/ice: use WC store to update queue tail registers

Series applied.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2020-10-13 12:51 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-11 10:11 [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Radu Nicolau
2020-06-11 10:11 ` [dpdk-dev] [PATCH v1 2/2] net/i40e: use movdiri to update queue tail registers Radu Nicolau
2020-06-11 12:23 ` [dpdk-dev] [PATCH v1 1/2] eal/x86: add WC store function Jerin Jacob
2020-06-11 13:56   ` Nicolau, Radu
2020-06-11 15:33     ` Jerin Jacob
2020-06-15 11:11 ` Ananyev, Konstantin
2020-06-19 12:06 ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Radu Nicolau
2020-06-19 12:06   ` [dpdk-dev] [PATCH v2 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-01 13:15     ` Bruce Richardson
2020-07-01 13:14   ` [dpdk-dev] [PATCH v2 1/2] eal: add WC store functions Bruce Richardson
2020-07-01 14:15 ` [dpdk-dev] [PATCH v3 0/2] " Radu Nicolau
2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 1/2] " Radu Nicolau
2020-07-01 14:15   ` [dpdk-dev] [PATCH v3 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-02  9:23 ` [dpdk-dev] [PATCH v4 0/2] eal: add WC store functions Radu Nicolau
2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 1/2] " Radu Nicolau
2020-07-03 15:19     ` David Marchand
2020-07-06  9:15       ` Nicolau, Radu
2020-07-02  9:23   ` [dpdk-dev] [PATCH v4 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-06 12:29 ` [dpdk-dev] [PATCH v5 0/2] eal: add WC store functions Radu Nicolau
2020-07-06 12:29   ` [dpdk-dev] [PATCH v5 1/2] " Radu Nicolau
2020-07-06 12:30   ` [dpdk-dev] [PATCH v5 2/2] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-13 12:27 ` [dpdk-dev] [PATCH v6 0/4] eal: add WC store functions Radu Nicolau
2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 1/4] " Radu Nicolau
2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 3/4] qat: " Radu Nicolau
2020-07-13 12:44     ` Bruce Richardson
2020-07-13 12:52       ` Trahe, Fiona
2020-07-13 12:57         ` Bruce Richardson
2020-07-13 12:27   ` [dpdk-dev] [PATCH v6 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
2020-07-16 12:29 ` [dpdk-dev] [PATCH v7 0/4] eal: add WC store functions Radu Nicolau
2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 1/4] " Radu Nicolau
2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 3/4] common/qat: " Radu Nicolau
2020-07-16 12:29   ` [dpdk-dev] [PATCH v7 4/4] net/ixgbe: use WC store to update doorbell register Radu Nicolau
2020-07-17 10:49 ` [dpdk-dev] [PATCH v8 0/4] eal: add WC store functions Radu Nicolau
2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 1/4] " Radu Nicolau
2020-07-20  6:42     ` Ruifeng Wang
2020-07-20  8:52       ` Nicolau, Radu
2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-20  6:46     ` Ruifeng Wang
2020-07-20  8:54       ` Nicolau, Radu
2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 3/4] common/qat: " Radu Nicolau
2020-07-17 16:42     ` Trahe, Fiona
2020-07-17 10:49   ` [dpdk-dev] [PATCH v8 4/4] net/ixgbe: " Radu Nicolau
2020-07-17 11:18     ` Ananyev, Konstantin
2020-07-20  9:12 ` [dpdk-dev] [PATCH v9 0/4] eal: add WC store functions Radu Nicolau
2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 1/4] " Radu Nicolau
2020-07-20 12:20     ` David Marchand
2020-07-21  8:56       ` Nicolau, Radu
2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 3/4] common/qat: " Radu Nicolau
2020-07-20  9:12   ` [dpdk-dev] [PATCH v9 4/4] net/ixgbe: " Radu Nicolau
2020-07-21 11:31 ` [dpdk-dev] [PATCH v10 0/4] eal: add WC store functions Radu Nicolau
2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 1/4] " Radu Nicolau
2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 2/4] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 3/4] common/qat: " Radu Nicolau
2020-07-21 11:31   ` [dpdk-dev] [PATCH v10 4/4] net/ixgbe: " Radu Nicolau
2020-08-26  9:55 ` [dpdk-dev] [PATCH v11 0/5] eal: add WC store functions Radu Nicolau
2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 1/5] " Radu Nicolau
2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-09-23  1:19     ` Lu, Wenzhuo
2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 3/5] common/qat: " Radu Nicolau
2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 4/5] net/ixgbe: " Radu Nicolau
2020-09-23  1:20     ` Lu, Wenzhuo
2020-08-26  9:55   ` [dpdk-dev] [PATCH v11 5/5] net/ice: " Radu Nicolau
2020-09-23  1:20     ` Lu, Wenzhuo
2020-09-23 14:22 ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions Radu Nicolau
2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 1/5] " Radu Nicolau
2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 2/5] net/i40e: use WC store to update queue tail registers Radu Nicolau
2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 3/5] common/qat: " Radu Nicolau
2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 4/5] net/ixgbe: " Radu Nicolau
2020-09-23 14:22   ` [dpdk-dev] [PATCH v12 5/5] net/ice: " Radu Nicolau
2020-10-08  7:28   ` [dpdk-dev] [PATCH v12 0/5] eal: add WC store functions David Marchand
2020-10-08  9:51     ` Nicolau, Radu
2020-10-13  8:57     ` Ferruh Yigit
2020-10-13 12:50   ` David Marchand

DPDK patches and discussions

This inbox may be cloned and mirrored by anyone:

	git clone --mirror https://inbox.dpdk.org/dev/0 dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dev dev/ https://inbox.dpdk.org/dev \
		dev@dpdk.org
	public-inbox-index dev

Example config snippet for mirrors.
Newsgroup available over NNTP:
	nntp://inbox.dpdk.org/inbox.dpdk.dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git