* [dpdk-stable] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
@ 2019-01-18 10:29 Shahed Shaikh
2019-01-18 10:29 ` [dpdk-stable] [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
To: dev; +Cc: ferruh.yigit, stable
Allocating replacement buffer per received packet is expensive.
Instead, process received packets first and allocate
replacement buffers in bulk later.
This improves performance by ~25% in terms of PPS on AMD
platforms.
Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org
Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----------
drivers/net/qede/qede_rxtx.h | 2 +
2 files changed, 75 insertions(+), 24 deletions(-)
diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 0e33be1..684c4ae 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
return 0;
}
+#define QEDE_MAX_BULK_ALLOC_COUNT 512
+
+static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq, int count)
+{
+ void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
+ struct rte_mbuf *mbuf = NULL;
+ struct eth_rx_bd *rx_bd;
+ dma_addr_t mapping;
+ int i, ret = 0;
+ uint16_t idx;
+
+ idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+
+ if (count > QEDE_MAX_BULK_ALLOC_COUNT)
+ count = QEDE_MAX_BULK_ALLOC_COUNT;
+
+ ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
+ if (unlikely(ret)) {
+ PMD_RX_LOG(ERR, rxq,
+ "Failed to allocate %d rx buffers "
+ "sw_rx_prod %u sw_rx_cons %u mp entries %u free %u",
+ count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
+ rte_mempool_avail_count(rxq->mb_pool),
+ rte_mempool_in_use_count(rxq->mb_pool));
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < count; i++) {
+ mbuf = obj_p[i];
+ if (likely(i < count - 1))
+ rte_prefetch0(obj_p[i + 1]);
+
+ idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
+ rxq->sw_rx_ring[idx].mbuf = mbuf;
+ rxq->sw_rx_ring[idx].page_offset = 0;
+ mapping = rte_mbuf_data_iova_default(mbuf);
+ rx_bd = (struct eth_rx_bd *)
+ ecore_chain_produce(&rxq->rx_bd_ring);
+ rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
+ rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
+ rxq->sw_rx_prod++;
+ }
+
+ return 0;
+}
+
/* Criterias for calculating Rx buffer size -
* 1) rx_buf_size should not exceed the size of mbuf
* 2) In scattered_rx mode - minimum rx_buf_size should be
@@ -1131,7 +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
{
struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring);
- uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
+ uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
struct qede_rx_entry *curr_prod;
dma_addr_t new_mapping;
@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
uint8_t bitfield_val;
#endif
uint8_t tunn_parse_flag;
- uint8_t j;
struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
uint64_t ol_flags;
uint32_t packet_type;
@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
uint8_t offset, tpa_agg_idx, flags;
struct qede_agg_info *tpa_info = NULL;
uint32_t rss_hash;
+ int rx_alloc_count = 0;
hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (hw_comp_cons == sw_comp_cons)
return 0;
+ /* Allocate buffers that we used in previous loop */
+ if (rxq->rx_alloc_count) {
+ if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
+ rxq->rx_alloc_count))) {
+ struct rte_eth_dev *dev;
+
+ PMD_RX_LOG(ERR, rxq,
+ "New buffer allocation failed,"
+ "dropping incoming packet\n");
+ dev = &rte_eth_devices[rxq->port_id];
+ dev->data->rx_mbuf_alloc_failed +=
+ rxq->rx_alloc_count;
+ rxq->rx_alloc_errors += rxq->rx_alloc_count;
+ return 0;
+ }
+ qede_update_rx_prod(qdev, rxq);
+ rxq->rx_alloc_count = 0;
+ }
+
while (sw_comp_cons != hw_comp_cons) {
ol_flags = 0;
packet_type = RTE_PTYPE_UNKNOWN;
@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
rx_mb->hash.rss = rss_hash;
}
- if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
- PMD_RX_LOG(ERR, rxq,
- "New buffer allocation failed,"
- "dropping incoming packet\n");
- qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
- rte_eth_devices[rxq->port_id].
- data->rx_mbuf_alloc_failed++;
- rxq->rx_alloc_errors++;
- break;
- }
+ rx_alloc_count++;
qede_rx_bd_ring_consume(rxq);
if (!tpa_start_flg && fp_cqe->bd_num > 1) {
@@ -1574,17 +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
pkt_len - len))
goto next_cqe;
- for (j = 0; j < num_segs; j++) {
- if (qede_alloc_rx_buffer(rxq)) {
- PMD_RX_LOG(ERR, rxq,
- "Buffer allocation failed");
- rte_eth_devices[rxq->port_id].
- data->rx_mbuf_alloc_failed++;
- rxq->rx_alloc_errors++;
- break;
- }
- rxq->rx_segs++;
- }
+
+ rx_alloc_count += num_segs;
+ rxq->rx_segs += num_segs;
}
rxq->rx_segs++; /* for the first segment */
@@ -1626,7 +1674,8 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
}
}
- qede_update_rx_prod(qdev, rxq);
+ /* Request number of buffers to be allocated in next loop */
+ rxq->rx_alloc_count = rx_alloc_count;
rxq->rcv_pkts += rx_pkt;
diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h
index 454daa0..5b249cb 100644
--- a/drivers/net/qede/qede_rxtx.h
+++ b/drivers/net/qede/qede_rxtx.h
@@ -192,6 +192,8 @@ struct qede_rx_queue {
uint16_t queue_id;
uint16_t port_id;
uint16_t rx_buf_size;
+ uint16_t rx_alloc_count;
+ uint16_t unused;
uint64_t rcv_pkts;
uint64_t rx_segs;
uint64_t rx_hw_errors;
--
2.7.4
^ permalink raw reply [flat|nested] 5+ messages in thread
* [dpdk-stable] [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path
2019-01-18 10:29 [dpdk-stable] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
@ 2019-01-18 10:29 ` Shahed Shaikh
2019-01-18 14:41 ` [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
[not found] ` <BYAPR18MB28388FB930DFE1C9FF5F4A0FB59C0@BYAPR18MB2838.namprd18.prod.outlook.com>
2 siblings, 0 replies; 5+ messages in thread
From: Shahed Shaikh @ 2019-01-18 10:29 UTC (permalink / raw)
To: dev; +Cc: ferruh.yigit, stable
Incorrect placement of prefetch in Tx path is causing
a performance drop of around ~2% on AMD platform.
Fixes: 2ea6f76aff40 ("qede: add core driver")
Cc: stable@dpdk.org
Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
---
drivers/net/qede/qede_rxtx.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c
index 684c4ae..eda19b2 100644
--- a/drivers/net/qede/qede_rxtx.c
+++ b/drivers/net/qede/qede_rxtx.c
@@ -2181,7 +2181,6 @@ qede_xmit_pkts(void *p_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
txq->nb_tx_avail -= bd1->data.nbds;
txq->sw_tx_prod++;
- rte_prefetch0(txq->sw_tx_ring[TX_PROD(txq)].mbuf);
bd_prod =
rte_cpu_to_le_16(ecore_chain_get_prod_idx(&txq->tx_pbl));
#ifdef RTE_LIBRTE_QEDE_DEBUG_TX
--
2.7.4
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
2019-01-18 10:29 [dpdk-stable] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
2019-01-18 10:29 ` [dpdk-stable] [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
@ 2019-01-18 14:41 ` Ferruh Yigit
2019-01-18 14:41 ` Ferruh Yigit
[not found] ` <BYAPR18MB28388FB930DFE1C9FF5F4A0FB59C0@BYAPR18MB2838.namprd18.prod.outlook.com>
2 siblings, 1 reply; 5+ messages in thread
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody
On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
> Allocating replacement buffer per received packet is expensive.
> Instead, process received packets first and allocate
> replacement buffers in bulk later.
>
> This improves performance by ~25% in terms of PPS on AMD
> platforms.
>
> Fixes: 2ea6f76aff40 ("qede: add core driver")
> Cc: stable@dpdk.org
>
> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
Hi Shahed,
This patch was sent on the same day as RC3, very close to the actual release,
and it updates the data path of the driver, so it is not a trivial patch.
Although I tend to accept driver patches late in the process, this one was too
late: you won't have proper time to fix any errors that are detected. However,
I can also see that it has a significant performance effect.
I would like to explicitly ask if you are willing to take the risk. If the answer
is yes, please also be sure to get Rasesh's ack.
Thanks,
ferruh
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
2019-01-18 14:41 ` [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
@ 2019-01-18 14:41 ` Ferruh Yigit
0 siblings, 0 replies; 5+ messages in thread
From: Ferruh Yigit @ 2019-01-18 14:41 UTC (permalink / raw)
To: Shahed Shaikh, dev; +Cc: stable, Rasesh Mody, Thomas Monjalon
On 1/18/2019 2:41 PM, Ferruh Yigit wrote:
> On 1/18/2019 10:29 AM, Shahed Shaikh wrote:
>> Allocating replacement buffer per received packet is expensive.
>> Instead, process received packets first and allocate
>> replacement buffers in bulk later.
>>
>> This improves performance by ~25% in terms of PPS on AMD
>> platforms.
>>
>> Fixes: 2ea6f76aff40 ("qede: add core driver")
>> Cc: stable@dpdk.org
>>
>> Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
>
> Hi Shahed,
>
> This patch was sent on the same day as RC3, very close to the actual release,
> and it updates the data path of the driver, so it is not a trivial patch.
>
> Although I tend to accept driver patches late in the process, this one was too
> late: you won't have proper time to fix any errors that are detected. However,
> I can also see that it has a significant performance effect.
>
> I would like to explicitly ask if you are willing to take the risk. If the answer
> is yes, please also be sure to get Rasesh's ack.
cc'ed Thomas.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path
[not found] ` <BYAPR18MB28388FB930DFE1C9FF5F4A0FB59C0@BYAPR18MB2838.namprd18.prod.outlook.com>
@ 2019-01-18 23:39 ` Thomas Monjalon
0 siblings, 0 replies; 5+ messages in thread
From: Thomas Monjalon @ 2019-01-18 23:39 UTC (permalink / raw)
To: Rasesh Mody, Shahed Shaikh; +Cc: dev, ferruh.yigit, stable
18/01/2019 17:57, Rasesh Mody:
> >From: dev <dev-bounces@dpdk.org> On Behalf Of Shahed Shaikh
> >Sent: Friday, January 18, 2019 2:29 AM
> >
> >Allocating replacement buffer per received packet is expensive.
> >Instead, process received packets first and allocate replacement buffers in
> >bulk later.
> >
> >This improves performance by ~25% in terms of PPS on AMD platforms.
> >
> >Fixes: 2ea6f76aff40 ("qede: add core driver")
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Shahed Shaikh <shshaikh@marvell.com>
Applied, thanks
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2019-01-18 23:40 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-18 10:29 [dpdk-stable] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Shahed Shaikh
2019-01-18 10:29 ` [dpdk-stable] [PATCH 2/2] net/qede: fix prefetch from incorrect place in Tx path Shahed Shaikh
2019-01-18 14:41 ` [dpdk-stable] [dpdk-dev] [PATCH 1/2] net/qede: fix performance bottleneck in Rx path Ferruh Yigit
2019-01-18 14:41 ` Ferruh Yigit
[not found] ` <BYAPR18MB28388FB930DFE1C9FF5F4A0FB59C0@BYAPR18MB2838.namprd18.prod.outlook.com>
2019-01-18 23:39 ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).