* [dpdk-dev] [PATCH 1/7] net/mlx5: prepare Tx vectorization
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 2/7] net/mlx5: use work queue buffer as a raw buffer Nelio Laranjeiro
` (6 subsequent siblings)
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil, Elad Persiko
Prepare the code to write the Work Queue Element with vectorized
instructions.
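For illustration, a minimal standalone sketch of the pattern this patch
moves towards (hypothetical structure and names, not the exact mlx5
layout): per-packet values are gathered into locals first, and all the
known and common WQE fields are then written together in one block,
which is what later allows the compiler to turn those stores into a
single vector write.

#include <stdint.h>

/* Hypothetical WQE header layout, for illustration only. */
struct wqe_hdr {
	uint32_t ctrl[4];
	uint16_t inline_hdr_sz;
	uint8_t  inline_hdr[2];
};

static void
wqe_finalize(struct wqe_hdr *wqe, const uint8_t *pkt,
	     uint32_t opcode, uint32_t qp_ds, uint16_t inline_sz)
{
	/* First two Ethernet-header bytes, saved before the header copy. */
	uint8_t ehdr[2] = { pkt[0], pkt[1] };

	/* All common fields written contiguously, at the end. */
	wqe->ctrl[0] = opcode;
	wqe->ctrl[1] = qp_ds;
	wqe->ctrl[2] = 0;
	wqe->ctrl[3] = 0;
	wqe->inline_hdr_sz = inline_sz;
	wqe->inline_hdr[0] = ehdr[0];
	wqe->inline_hdr[1] = ehdr[1];
}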
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Signed-off-by: Elad Persiko <eladpe@mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 44 ++++++++++++++++++++++++++++----------------
1 file changed, 28 insertions(+), 16 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ffd09ac..5dacd93 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -391,6 +391,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint32_t length;
unsigned int ds = 0;
uintptr_t addr;
+ uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
+ uint8_t ehdr[2];
#ifdef MLX5_PMD_SOFT_COUNTERS
uint32_t total_length = 0;
#endif
@@ -416,6 +418,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch0(*pkts);
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
+ ehdr[0] = ((uint8_t *)addr)[0];
+ ehdr[1] = ((uint8_t *)addr)[1];
#ifdef MLX5_PMD_SOFT_COUNTERS
total_length = length;
#endif
@@ -439,24 +443,20 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
} else {
wqe->eseg.cs_flags = 0;
}
- raw = (uint8_t *)(uintptr_t)&wqe->eseg.inline_hdr[0];
- /* Start the know and common part of the WQE structure. */
- wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
- wqe->ctrl[2] = 0;
- wqe->ctrl[3] = 0;
- wqe->eseg.rsvd0 = 0;
- wqe->eseg.rsvd1 = 0;
- wqe->eseg.mss = 0;
- wqe->eseg.rsvd2 = 0;
- /* Start by copying the Ethernet Header. */
- memcpy((uint8_t *)raw, ((uint8_t *)addr), 16);
+ raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
+ /*
+ * Start by copying the Ethernet header minus the first two
+ * bytes which will be appended at the end of the Ethernet
+ * segment.
+ */
+ memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2, 16);
length -= MLX5_WQE_DWORD_SIZE;
addr += MLX5_WQE_DWORD_SIZE;
/* Replace the Ethernet type by the VLAN if necessary. */
if (buf->ol_flags & PKT_TX_VLAN_PKT) {
uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
- memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE -
+ memcpy((uint8_t *)(raw + MLX5_WQE_DWORD_SIZE - 2 -
sizeof(vlan)),
&vlan, sizeof(vlan));
addr -= sizeof(vlan);
@@ -468,10 +468,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
(uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
uint16_t max_inline =
txq->max_inline * RTE_CACHE_LINE_SIZE;
- uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
uint16_t room;
- raw += MLX5_WQE_DWORD_SIZE;
+ /*
+ * raw starts two bytes before the boundary to
+ * continue the above copy of packet data.
+ */
+ raw += MLX5_WQE_DWORD_SIZE - 2;
room = end - (uintptr_t)raw;
if (room > max_inline) {
uintptr_t addr_end = (addr + max_inline) &
@@ -487,8 +490,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Sanity check. */
assert(addr <= addr_end);
}
- /* Store the inlined packet size in the WQE. */
- wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
/*
* 2 DWORDs consumed by the WQE header + 1 DSEG +
* the size of the inline part of the packet.
@@ -570,7 +571,18 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
--pkts_n;
next_pkt:
++i;
+ /* Initialize known and common part of the WQE structure. */
+ wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
+ wqe->ctrl[2] = 0;
+ wqe->ctrl[3] = 0;
+ wqe->eseg.rsvd0 = 0;
+ wqe->eseg.rsvd1 = 0;
+ wqe->eseg.mss = 0;
+ wqe->eseg.rsvd2 = 0;
+ wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
+ wqe->eseg.inline_hdr[0] = ehdr[0];
+ wqe->eseg.inline_hdr[1] = ehdr[1];
txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
--
2.1.4
* [dpdk-dev] [PATCH 2/7] net/mlx5: use work queue buffer as a raw buffer
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 1/7] net/mlx5: prepare Tx vectorization Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 3/7] net/mlx5: use vector types to speed up processing Nelio Laranjeiro
` (5 subsequent siblings)
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil
Define a single work queue element type that encompasses all the
existing ones. It groups the control segment, the Ethernet segment and
the raw data in a single place.
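The key consequence is that work queue entries are no longer reached
through array indexing on a typed pointer but by address arithmetic on
a raw buffer. A minimal standalone sketch of that indexing, mirroring
the tx_mlx5_wqe() helper introduced below (WQE_SIZE stands in for
MLX5_WQE_SIZE):

#include <stdint.h>

#define WQE_SIZE 64 /* placeholder for MLX5_WQE_SIZE */

static inline uintptr_t *
wqe_addr(volatile void *wqes, uint16_t ci, unsigned int log_wqe_n)
{
	/* Wrap the consumer index on the power-of-two ring size. */
	ci &= (1 << log_wqe_n) - 1;
	/* Scale by the fixed element size into the raw buffer. */
	return (uintptr_t *)((uintptr_t)wqes + ci * WQE_SIZE);
}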
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_prm.h | 13 ++++--
drivers/net/mlx5/mlx5_rxtx.c | 103 ++++++++++++++++++++++---------------------
drivers/net/mlx5/mlx5_rxtx.h | 2 +-
drivers/net/mlx5/mlx5_txq.c | 8 ++--
4 files changed, 68 insertions(+), 58 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 7f31a2f..3dd4cbe 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -114,12 +114,19 @@ struct mlx5_wqe_eth_seg_small {
uint32_t rsvd2;
uint16_t inline_hdr_sz;
uint8_t inline_hdr[2];
-};
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
struct mlx5_wqe_inl_small {
uint32_t byte_cnt;
uint8_t raw;
-};
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
+
+struct mlx5_wqe_ctrl {
+ uint32_t ctrl0;
+ uint32_t ctrl1;
+ uint32_t ctrl2;
+ uint32_t ctrl3;
+} __rte_aligned(MLX5_WQE_DWORD_SIZE);
/* Small common part of the WQE. */
struct mlx5_wqe {
@@ -131,7 +138,7 @@ struct mlx5_wqe {
struct mlx5_wqe64 {
struct mlx5_wqe hdr;
uint8_t raw[32];
-} __rte_aligned(64);
+} __rte_aligned(MLX5_WQE_SIZE);
/* MPW session status. */
enum mlx5_mpw_state {
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 5dacd93..ada8e74 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -154,6 +154,24 @@ check_cqe(volatile struct mlx5_cqe *cqe,
return 0;
}
+/**
+ * Return the address of the WQE.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param ci
+ * WQE consumer index.
+ *
+ * @return
+ * WQE address.
+ */
+static inline uintptr_t *
+tx_mlx5_wqe(struct txq *txq, uint16_t ci)
+{
+ ci &= ((1 << txq->wqe_n) - 1);
+ return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
+}
+
static inline void
txq_complete(struct txq *txq) __attribute__((always_inline));
@@ -175,7 +193,7 @@ txq_complete(struct txq *txq)
uint16_t elts_tail;
uint16_t cq_ci = txq->cq_ci;
volatile struct mlx5_cqe *cqe = NULL;
- volatile struct mlx5_wqe *wqe;
+ volatile struct mlx5_wqe_ctrl *ctrl;
do {
volatile struct mlx5_cqe *tmp;
@@ -201,9 +219,9 @@ txq_complete(struct txq *txq)
} while (1);
if (unlikely(cqe == NULL))
return;
- wqe = &(*txq->wqes)[ntohs(cqe->wqe_counter) &
- ((1 << txq->wqe_n) - 1)].hdr;
- elts_tail = wqe->ctrl[3];
+ ctrl = (volatile struct mlx5_wqe_ctrl *)
+ tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter));
+ elts_tail = ctrl->ctrl3;
assert(elts_tail < (1 << txq->wqe_n));
/* Free buffers. */
while (elts_free != elts_tail) {
@@ -331,23 +349,6 @@ tx_prefetch_cqe(struct txq *txq, uint16_t ci)
}
/**
- * Prefetch a WQE.
- *
- * @param txq
- * Pointer to TX queue structure.
- * @param wqe_ci
- * WQE consumer index.
- */
-static inline void
-tx_prefetch_wqe(struct txq *txq, uint16_t ci)
-{
- volatile struct mlx5_wqe64 *wqe;
-
- wqe = &(*txq->wqes)[ci & ((1 << txq->wqe_n) - 1)];
- rte_prefetch0(wqe);
-}
-
-/**
* DPDK callback for TX.
*
* @param dpdk_txq
@@ -411,9 +412,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
--segs_n;
if (!segs_n)
--pkts_n;
- wqe = &(*txq->wqes)[txq->wqe_ci &
- ((1 << txq->wqe_n) - 1)].hdr;
- tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ wqe = (volatile struct mlx5_wqe *)
+ tx_mlx5_wqe(txq, txq->wqe_ci);
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
if (pkts_n > 1)
rte_prefetch0(*pkts);
addr = rte_pktmbuf_mtod(buf, uintptr_t);
@@ -464,8 +465,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
}
/* Inline if enough room. */
if (txq->max_inline != 0) {
- uintptr_t end =
- (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n];
+ uintptr_t end = (uintptr_t)
+ (((uintptr_t)txq->wqes) +
+ (1 << txq->wqe_n) * MLX5_WQE_SIZE);
uint16_t max_inline =
txq->max_inline * RTE_CACHE_LINE_SIZE;
uint16_t room;
@@ -496,12 +498,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
*/
ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
if (length > 0) {
- dseg = (struct mlx5_wqe_data_seg *)
+ dseg = (volatile struct mlx5_wqe_data_seg *)
((uintptr_t)wqe +
(ds * MLX5_WQE_DWORD_SIZE));
if ((uintptr_t)dseg >= end)
- dseg = (struct mlx5_wqe_data_seg *)
- ((uintptr_t)&(*txq->wqes)[0]);
+ dseg = (volatile struct
+ mlx5_wqe_data_seg *)
+ txq->wqes;
goto use_dseg;
} else if (!segs_n) {
goto next_pkt;
@@ -514,12 +517,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
* Ethernet Header as been stored.
*/
wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
- dseg = (struct mlx5_wqe_data_seg *)
+ dseg = (volatile struct mlx5_wqe_data_seg *)
((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
ds = 3;
use_dseg:
/* Add the remaining packet as a simple ds. */
- *dseg = (struct mlx5_wqe_data_seg) {
+ *dseg = (volatile struct mlx5_wqe_data_seg) {
.addr = htonll(addr),
.byte_count = htonl(length),
.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
@@ -542,9 +545,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
((1 << txq->wqe_n) - 1);
- dseg = (struct mlx5_wqe_data_seg *)
- ((uintptr_t)&(*txq->wqes)[n]);
- tx_prefetch_wqe(txq, n + 1);
+ dseg = (volatile struct mlx5_wqe_data_seg *)
+ tx_mlx5_wqe(txq, n);
+ rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
} else {
++dseg;
}
@@ -556,7 +559,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
total_length += length;
#endif
/* Store segment information. */
- *dseg = (struct mlx5_wqe_data_seg) {
+ *dseg = (volatile struct mlx5_wqe_data_seg) {
.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
.byte_count = htonl(length),
.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
@@ -629,13 +632,13 @@ mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
(volatile struct mlx5_wqe_data_seg (*)[])
- (uintptr_t)&(*txq->wqes)[(idx + 1) & ((1 << txq->wqe_n) - 1)];
+ tx_mlx5_wqe(txq, idx + 1);
mpw->state = MLX5_MPW_STATE_OPENED;
mpw->pkts_n = 0;
mpw->len = length;
mpw->total_len = 0;
- mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+ mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
mpw->wqe->eseg.mss = htons(length);
mpw->wqe->eseg.inline_hdr_sz = 0;
mpw->wqe->eseg.rsvd0 = 0;
@@ -677,8 +680,8 @@ mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
++txq->wqe_ci;
else
txq->wqe_ci += 2;
- tx_prefetch_wqe(txq, txq->wqe_ci);
- tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
}
/**
@@ -712,8 +715,8 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
- tx_prefetch_wqe(txq, txq->wqe_ci);
- tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
@@ -841,7 +844,7 @@ mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
mpw->pkts_n = 0;
mpw->len = length;
mpw->total_len = 0;
- mpw->wqe = (volatile struct mlx5_wqe *)&(*txq->wqes)[idx].hdr;
+ mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
(txq->wqe_ci << 8) |
MLX5_OPCODE_TSO);
@@ -917,8 +920,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
return 0;
/* Prefetch first packet cacheline. */
tx_prefetch_cqe(txq, txq->cq_ci);
- tx_prefetch_wqe(txq, txq->wqe_ci);
- tx_prefetch_wqe(txq, txq->wqe_ci + 1);
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
+ rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
/* Start processing. */
txq_complete(txq);
max = (elts_n - (elts_head - txq->elts_tail));
@@ -1019,14 +1022,15 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
addr = rte_pktmbuf_mtod(buf, uintptr_t);
(*txq->elts)[elts_head] = buf;
/* Maximum number of bytes before wrapping. */
- max = ((uintptr_t)&(*txq->wqes)[1 << txq->wqe_n] -
+ max = ((((uintptr_t)(txq->wqes)) +
+ (1 << txq->wqe_n) *
+ MLX5_WQE_SIZE) -
(uintptr_t)mpw.data.raw);
if (length > max) {
rte_memcpy((void *)(uintptr_t)mpw.data.raw,
(void *)addr,
max);
- mpw.data.raw =
- (volatile void *)&(*txq->wqes)[0];
+ mpw.data.raw = (volatile void *)txq->wqes;
rte_memcpy((void *)(uintptr_t)mpw.data.raw,
(void *)(addr + max),
length - max);
@@ -1038,9 +1042,8 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
mpw.data.raw += length;
}
if ((uintptr_t)mpw.data.raw ==
- (uintptr_t)&(*txq->wqes)[1 << txq->wqe_n])
- mpw.data.raw =
- (volatile void *)&(*txq->wqes)[0];
+ (uintptr_t)tx_mlx5_wqe(txq, 1 << txq->wqe_n))
+ mpw.data.raw = (volatile void *)txq->wqes;
++mpw.pkts_n;
++j;
if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8f2cddb..b9b90a7 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -259,7 +259,7 @@ struct txq {
uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
uint32_t qp_num_8s; /* QP number shifted by 8. */
volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
- volatile struct mlx5_wqe64 (*wqes)[]; /* Work queue. */
+ volatile void *wqes; /* Work queue (use volatile to write into). */
volatile uint32_t *qp_db; /* Work queue doorbell. */
volatile uint32_t *cq_db; /* Completion queue doorbell. */
volatile void *bf_reg; /* Blueflame register. */
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 053665d..f4c6682 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -82,7 +82,9 @@ txq_alloc_elts(struct txq_ctrl *txq_ctrl, unsigned int elts_n)
for (i = 0; (i != elts_n); ++i)
(*txq_ctrl->txq.elts)[i] = NULL;
for (i = 0; (i != (1u << txq_ctrl->txq.wqe_n)); ++i) {
- volatile struct mlx5_wqe64 *wqe = &(*txq_ctrl->txq.wqes)[i];
+ volatile struct mlx5_wqe64 *wqe =
+ (volatile struct mlx5_wqe64 *)
+ txq_ctrl->txq.wqes + i;
memset((void *)(uintptr_t)wqe, 0x0, sizeof(*wqe));
}
@@ -214,9 +216,7 @@ txq_setup(struct txq_ctrl *tmpl, struct txq_ctrl *txq_ctrl)
}
tmpl->txq.cqe_n = log2above(ibcq->cqe);
tmpl->txq.qp_num_8s = qp->ctrl_seg.qp_num << 8;
- tmpl->txq.wqes =
- (volatile struct mlx5_wqe64 (*)[])
- (uintptr_t)qp->gen_data.sqstart;
+ tmpl->txq.wqes = qp->gen_data.sqstart;
tmpl->txq.wqe_n = log2above(qp->sq.wqe_cnt);
tmpl->txq.qp_db = &qp->gen_data.db[MLX5_SND_DBR];
tmpl->txq.bf_reg = qp->gen_data.bf->reg;
--
2.1.4
* [dpdk-dev] [PATCH 3/7] net/mlx5: use vector types to speed up processing
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 1/7] net/mlx5: prepare Tx vectorization Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 2/7] net/mlx5: use work queue buffer as a raw buffer Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 4/7] net/mlx5: fix missing inline attributes Nelio Laranjeiro
` (4 subsequent siblings)
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil
Let the compiler automatically use the vector capabilities of the
target machine to optimize the generated instructions.
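The rte_v128u32_t type used below comes from the generic vector types
added to rte_vect.h; assuming it is built on the GCC vector_size
extension, a minimal sketch of the effect:

#include <stdint.h>

/* Assumed equivalent of rte_v128u32_t from rte_vect.h. */
typedef uint32_t v128u32 __attribute__((vector_size(16)));

/*
 * Assigning a whole 16-byte vector at once lets the compiler emit one
 * SIMD store (SSE on x86, VSX on POWER8) where four scalar 32-bit
 * stores were generated before.
 */
static void
store_ctrl_seg(volatile v128u32 *dst, uint32_t c0, uint32_t c1)
{
	*dst = (v128u32){ c0, c1, 0, 0 };
}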
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_prm.h | 7 +++++
drivers/net/mlx5/mlx5_rxtx.c | 74 +++++++++++++++++++++++---------------------
2 files changed, 46 insertions(+), 35 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 3dd4cbe..9cd9fdf 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -44,6 +44,7 @@
#pragma GCC diagnostic error "-Wpedantic"
#endif
+#include <rte_vect.h>
#include "mlx5_autoconf.h"
/* Get CQE owner bit. */
@@ -134,6 +135,12 @@ struct mlx5_wqe {
struct mlx5_wqe_eth_seg_small eseg;
};
+/* Vectorize WQE header. */
+struct mlx5_wqe_v {
+ rte_v128u32_t ctrl;
+ rte_v128u32_t eseg;
+};
+
/* WQE. */
struct mlx5_wqe64 {
struct mlx5_wqe hdr;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ada8e74..e161cd9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -371,7 +371,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int j = 0;
unsigned int max;
unsigned int comp;
- volatile struct mlx5_wqe *wqe = NULL;
+ volatile struct mlx5_wqe_v *wqe = NULL;
unsigned int segs_n = 0;
struct rte_mbuf *buf = NULL;
uint8_t *raw;
@@ -388,12 +388,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (max > elts_n)
max -= elts_n;
do {
- volatile struct mlx5_wqe_data_seg *dseg = NULL;
+ volatile rte_v128u32_t *dseg = NULL;
uint32_t length;
unsigned int ds = 0;
uintptr_t addr;
uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
uint8_t ehdr[2];
+ uint8_t cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
uint32_t total_length = 0;
#endif
@@ -412,7 +413,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
--segs_n;
if (!segs_n)
--pkts_n;
- wqe = (volatile struct mlx5_wqe *)
+ wqe = (volatile struct mlx5_wqe_v *)
tx_mlx5_wqe(txq, txq->wqe_ci);
rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
if (pkts_n > 1)
@@ -438,11 +439,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Should we enable HW CKSUM offload */
if (buf->ol_flags &
(PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
- wqe->eseg.cs_flags =
- MLX5_ETH_WQE_L3_CSUM |
- MLX5_ETH_WQE_L4_CSUM;
- } else {
- wqe->eseg.cs_flags = 0;
+ cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
}
raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
/*
@@ -498,12 +495,11 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
*/
ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
if (length > 0) {
- dseg = (volatile struct mlx5_wqe_data_seg *)
+ dseg = (volatile rte_v128u32_t *)
((uintptr_t)wqe +
(ds * MLX5_WQE_DWORD_SIZE));
if ((uintptr_t)dseg >= end)
- dseg = (volatile struct
- mlx5_wqe_data_seg *)
+ dseg = (volatile rte_v128u32_t *)
txq->wqes;
goto use_dseg;
} else if (!segs_n) {
@@ -516,16 +512,17 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
* No inline has been done in the packet, only the
* Ethernet Header as been stored.
*/
- wqe->eseg.inline_hdr_sz = htons(MLX5_WQE_DWORD_SIZE);
- dseg = (volatile struct mlx5_wqe_data_seg *)
+ dseg = (volatile rte_v128u32_t *)
((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
ds = 3;
use_dseg:
/* Add the remaining packet as a simple ds. */
- *dseg = (volatile struct mlx5_wqe_data_seg) {
- .addr = htonll(addr),
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ addr = htonll(addr);
+ *dseg = (rte_v128u32_t){
+ htonl(length),
+ txq_mp2mr(txq, txq_mb2mp(buf)),
+ addr,
+ addr >> 32,
};
++ds;
if (!segs_n)
@@ -545,7 +542,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
((1 << txq->wqe_n) - 1);
- dseg = (volatile struct mlx5_wqe_data_seg *)
+ dseg = (volatile rte_v128u32_t *)
tx_mlx5_wqe(txq, n);
rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
} else {
@@ -559,10 +556,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
total_length += length;
#endif
/* Store segment information. */
- *dseg = (volatile struct mlx5_wqe_data_seg) {
- .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
- .byte_count = htonl(length),
- .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+ addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ *dseg = (rte_v128u32_t){
+ htonl(length),
+ txq_mp2mr(txq, txq_mb2mp(buf)),
+ addr,
+ addr >> 32,
};
(*txq->elts)[elts_head] = buf;
elts_head = (elts_head + 1) & (elts_n - 1);
@@ -575,17 +574,19 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
next_pkt:
++i;
/* Initialize known and common part of the WQE structure. */
- wqe->ctrl[0] = htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND);
- wqe->ctrl[1] = htonl(txq->qp_num_8s | ds);
- wqe->ctrl[2] = 0;
- wqe->ctrl[3] = 0;
- wqe->eseg.rsvd0 = 0;
- wqe->eseg.rsvd1 = 0;
- wqe->eseg.mss = 0;
- wqe->eseg.rsvd2 = 0;
- wqe->eseg.inline_hdr_sz = htons(pkt_inline_sz);
- wqe->eseg.inline_hdr[0] = ehdr[0];
- wqe->eseg.inline_hdr[1] = ehdr[1];
+ wqe->ctrl = (rte_v128u32_t){
+ htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
+ htonl(txq->qp_num_8s | ds),
+ 0,
+ 0,
+ };
+ wqe->eseg = (rte_v128u32_t){
+ 0,
+ cs_flags,
+ 0,
+ (ehdr[1] << 24) | (ehdr[0] << 16) |
+ htons(pkt_inline_sz),
+ };
txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
/* Increment sent bytes counter. */
@@ -598,10 +599,13 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
/* Check whether completion threshold has been reached. */
comp = txq->elts_comp + i + j;
if (comp >= MLX5_TX_COMP_THRESH) {
+ volatile struct mlx5_wqe_ctrl *w =
+ (volatile struct mlx5_wqe_ctrl *)wqe;
+
/* Request completion on last WQE. */
- wqe->ctrl[2] = htonl(8);
+ w->ctrl2 = htonl(8);
/* Save elts_head in unused "immediate" field of WQE. */
- wqe->ctrl[3] = elts_head;
+ w->ctrl3 = elts_head;
txq->elts_comp = 0;
} else {
txq->elts_comp = comp;
--
2.1.4
* [dpdk-dev] [PATCH 4/7] net/mlx5: fix missing inline attributes
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
` (2 preceding siblings ...)
2016-11-24 16:03 ` [dpdk-dev] [PATCH 3/7] net/mlx5: use vector types to speed up processing Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 5/7] net/mlx5: move static prototype Nelio Laranjeiro
` (3 subsequent siblings)
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil, stable
These functions must be forced inline for better performance.
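The pattern applied throughout the fix is the one already used in this
file: declare the attribute on a forward prototype so that GCC inlines
the function even when its size heuristics would otherwise decline. A
hedged sketch with a hypothetical function:

#include <stdint.h>

static inline int
cqe_is_valid(const volatile uint32_t *op_own)
	__attribute__((always_inline));

static inline int
cqe_is_valid(const volatile uint32_t *op_own)
{
	return (*op_own & 0x1) == 0; /* hypothetical validity check */
}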
Fixes: 99c12dcca65d ("net/mlx5: handle Rx CQE compression")
Fixes: 1d88ba171942 ("net/mlx5: refactor Tx data path")
Fixes: 67fa62bc672d ("mlx5: support checksum offload")
CC: stable@dpdk.org
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 34 +++++++++++++++++++++++++---------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e161cd9..52733da 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -71,6 +71,31 @@
#include "mlx5_defs.h"
#include "mlx5_prm.h"
+static inline int
+check_cqe(volatile struct mlx5_cqe *cqe,
+ unsigned int cqes_n, const uint16_t ci)
+ __attribute__((always_inline));
+
+static inline uint32_t
+txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+ __attribute__((always_inline));
+
+static inline void
+mlx5_tx_dbrec(struct txq *txq) __attribute__((always_inline));
+
+static inline uint32_t
+rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
+ __attribute__((always_inline));
+
+static inline int
+mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
+ uint16_t cqe_cnt, uint32_t *rss_hash)
+ __attribute__((always_inline));
+
+static inline uint32_t
+rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
+ __attribute__((always_inline));
+
#ifndef NDEBUG
/**
@@ -100,11 +125,6 @@ check_cqe_seen(volatile struct mlx5_cqe *cqe)
#endif /* NDEBUG */
-static inline int
-check_cqe(volatile struct mlx5_cqe *cqe,
- unsigned int cqes_n, const uint16_t ci)
- __attribute__((always_inline));
-
/**
* Check whether CQE is valid.
*
@@ -266,10 +286,6 @@ txq_mb2mp(struct rte_mbuf *buf)
return buf->pool;
}
-static inline uint32_t
-txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
- __attribute__((always_inline));
-
/**
* Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
* Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
--
2.1.4
* [dpdk-dev] [PATCH 5/7] net/mlx5: move static prototype
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
` (3 preceding siblings ...)
2016-11-24 16:03 ` [dpdk-dev] [PATCH 4/7] net/mlx5: fix missing inline attributes Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 6/7] net/mlx5: optimize copy of Ethernet header Nelio Laranjeiro
` (2 subsequent siblings)
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil
Gather function prototypes at the beginning of the file.
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 52733da..0d0b807 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -76,6 +76,9 @@ check_cqe(volatile struct mlx5_cqe *cqe,
unsigned int cqes_n, const uint16_t ci)
__attribute__((always_inline));
+static inline void
+txq_complete(struct txq *txq) __attribute__((always_inline));
+
static inline uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
__attribute__((always_inline));
@@ -192,9 +195,6 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
}
-static inline void
-txq_complete(struct txq *txq) __attribute__((always_inline));
-
/**
* Manage TX completions.
*
--
2.1.4
* [dpdk-dev] [PATCH 6/7] net/mlx5: optimize copy of Ethernet header
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
` (4 preceding siblings ...)
2016-11-24 16:03 ` [dpdk-dev] [PATCH 5/7] net/mlx5: move static prototype Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2016-11-24 16:03 ` [dpdk-dev] [PATCH 7/7] net/mlx5: remove inefficient prefetching Nelio Laranjeiro
2017-01-05 14:13 ` [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Ferruh Yigit
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil
Use fewer instructions to copy the first two bytes of Ethernet headers to
work queue elements.
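A standalone sketch of the idea behind the diff below: the two leading
bytes are combined into one 16-bit value up front, so the final WQE
word can be built with a single shift and OR instead of two byte-wide
insertions.

#include <stdint.h>

static inline uint16_t
first_two_bytes(const uint8_t *addr)
{
	/*
	 * Byte 0 lands in the low half and byte 1 in the high half,
	 * matching the layout the two separate byte stores produced
	 * before; byte loads keep it safe on unaligned addresses.
	 */
	return (uint16_t)((addr[1] << 8) | addr[0]);
}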
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0d0b807..4b8c197 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -409,7 +409,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
unsigned int ds = 0;
uintptr_t addr;
uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
- uint8_t ehdr[2];
+ uint16_t ehdr;
uint8_t cs_flags = 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
uint32_t total_length = 0;
@@ -436,8 +436,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
rte_prefetch0(*pkts);
addr = rte_pktmbuf_mtod(buf, uintptr_t);
length = DATA_LEN(buf);
- ehdr[0] = ((uint8_t *)addr)[0];
- ehdr[1] = ((uint8_t *)addr)[1];
+ ehdr = (((uint8_t *)addr)[1] << 8) |
+ ((uint8_t *)addr)[0];
#ifdef MLX5_PMD_SOFT_COUNTERS
total_length = length;
#endif
@@ -600,8 +600,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
0,
cs_flags,
0,
- (ehdr[1] << 24) | (ehdr[0] << 16) |
- htons(pkt_inline_sz),
+ (ehdr << 16) | htons(pkt_inline_sz),
};
txq->wqe_ci += (ds + 3) / 4;
#ifdef MLX5_PMD_SOFT_COUNTERS
--
2.1.4
* [dpdk-dev] [PATCH 7/7] net/mlx5: remove inefficient prefetching
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
` (5 preceding siblings ...)
2016-11-24 16:03 ` [dpdk-dev] [PATCH 6/7] net/mlx5: optimize copy of Ethernet header Nelio Laranjeiro
@ 2016-11-24 16:03 ` Nelio Laranjeiro
2017-01-05 14:13 ` [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Ferruh Yigit
7 siblings, 0 replies; 13+ messages in thread
From: Nelio Laranjeiro @ 2016-11-24 16:03 UTC
To: dev; +Cc: Thomas Monjalon, Adrien Mazarguil
Prefetching completion queue entries is inefficient because too few CPU
cycles are spent before their use, which results in cache misses anyway.
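As a hedged illustration with hypothetical names: a prefetch issued
immediately before the dependent load gives the hardware no time to
fetch the line, so the load stalls as if no prefetch had been issued.

#include <stdint.h>
#include <rte_prefetch.h>

static uint32_t
poll_entry(volatile uint32_t *cqes, uint16_t ci)
{
	/* Too close to the use below to hide any memory latency. */
	rte_prefetch0((void *)(uintptr_t)&cqes[ci]);
	return cqes[ci]; /* the load begins before the prefetch completes */
}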
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 21 ---------------------
1 file changed, 21 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 4b8c197..9f74fd4 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -348,23 +348,6 @@ mlx5_tx_dbrec(struct txq *txq)
}
/**
- * Prefetch a CQE.
- *
- * @param txq
- * Pointer to TX queue structure.
- * @param cqe_ci
- * CQE consumer index.
- */
-static inline void
-tx_prefetch_cqe(struct txq *txq, uint16_t ci)
-{
- volatile struct mlx5_cqe *cqe;
-
- cqe = &(*txq->cqes)[ci & ((1 << txq->cqe_n) - 1)];
- rte_prefetch0(cqe);
-}
-
-/**
* DPDK callback for TX.
*
* @param dpdk_txq
@@ -395,8 +378,6 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(!pkts_n))
return 0;
/* Prefetch first packet cacheline. */
- tx_prefetch_cqe(txq, txq->cq_ci);
- tx_prefetch_cqe(txq, txq->cq_ci + 1);
rte_prefetch0(*pkts);
/* Start processing. */
txq_complete(txq);
@@ -733,7 +714,6 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
if (unlikely(!pkts_n))
return 0;
/* Prefetch first packet cacheline. */
- tx_prefetch_cqe(txq, txq->cq_ci);
rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
/* Start processing. */
@@ -938,7 +918,6 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
if (unlikely(!pkts_n))
return 0;
/* Prefetch first packet cacheline. */
- tx_prefetch_cqe(txq, txq->cq_ci);
rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
/* Start processing. */
--
2.1.4
* Re: [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance
2016-11-24 16:03 [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Nelio Laranjeiro
` (6 preceding siblings ...)
2016-11-24 16:03 ` [dpdk-dev] [PATCH 7/7] net/mlx5: remove inefficient prefetching Nelio Laranjeiro
@ 2017-01-05 14:13 ` Ferruh Yigit
2017-01-18 7:20 ` Thomas Monjalon
7 siblings, 1 reply; 13+ messages in thread
From: Ferruh Yigit @ 2017-01-05 14:13 UTC
To: Nelio Laranjeiro, dev; +Cc: Thomas Monjalon, Adrien Mazarguil
On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> This series applies on top of
> "[PATCH] eal: define generic vector types" [1][2]
>
> Using built-in vector types forces compilers to consider SIMD instructions in
> specific places in order to improve performance on both IBM POWER8 and Intel
> architectures.
>
> For example, testpmd single-thread I/O forwarding packets per second
> performance is improved by 6% on Intel platforms.
>
> [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
> [2] http://dpdk.org/dev/patchwork/patch/17024/
>
> Nelio Laranjeiro (7):
> net/mlx5: prepare Tx vectorization
> net/mlx5: use work queue buffer as a raw buffer
> net/mlx5: use vector types to speed up processing
> net/mlx5: fix missing inline attributes
> net/mlx5: move static prototype
> net/mlx5: optimize copy of Ethernet header
> net/mlx5: remove inefficient prefetching
>
<...>
Series applied to dpdk-next-net/master, thanks.
* Re: [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance
2017-01-05 14:13 ` [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance Ferruh Yigit
@ 2017-01-18 7:20 ` Thomas Monjalon
2017-01-18 9:23 ` Nélio Laranjeiro
2017-01-18 10:01 ` [dpdk-dev] [PATCH] net/mlx5: fix 32bits compilation issue Nelio Laranjeiro
0 siblings, 2 replies; 13+ messages in thread
From: Thomas Monjalon @ 2017-01-18 7:20 UTC
To: Ferruh Yigit, Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil
2017-01-05 14:13, Ferruh Yigit:
> On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> > This series applies on top of
> > "[PATCH] eal: define generic vector types" [1][2]
> >
> > Using built-in vector types forces compilers to consider SIMD instructions in
> > specific places in order to improve performance on both IBM POWER8 and Intel
> > architectures.
> >
> > For example, testpmd single-thread I/O forwarding packets per second
> > performance is improved by 6% on Intel platforms.
> >
> > [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
> > [2] http://dpdk.org/dev/patchwork/patch/17024/
> >
> > Nelio Laranjeiro (7):
> > net/mlx5: prepare Tx vectorization
> > net/mlx5: use work queue buffer as a raw buffer
> > net/mlx5: use vector types to speed up processing
> > net/mlx5: fix missing inline attributes
> > net/mlx5: move static prototype
> > net/mlx5: optimize copy of Ethernet header
> > net/mlx5: remove inefficient prefetching
>
> Series applied to dpdk-next-net/master, thanks.
It will not be pulled in mainline because compilation fails on 32-bit:
drivers/net/mlx5/mlx5_rxtx.c: In function ‘mlx5_tx_burst’:
drivers/net/mlx5/mlx5_rxtx.c:523:10: error:
right shift count >= width of type [-Werror=shift-count-overflow]
addr >> 32,
^~
Please Ferruh, remove the series from next-net.
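The root cause is the C rule that shifting a value by at least its type
width is undefined: on ILP32 targets uintptr_t is only 32 bits wide, so
`addr >> 32` shifts by the full width and GCC rejects it under
-Werror=shift-count-overflow. A minimal sketch of the well-defined
form, which the fix later in this thread adopts through a 64-bit
temporary:

#include <stdint.h>

static inline uint32_t
addr_high_word(uintptr_t addr)
{
	uint64_t naddr = addr;          /* widen before shifting */
	return (uint32_t)(naddr >> 32); /* 0 on 32-bit targets */
}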
* Re: [dpdk-dev] [PATCH 0/7] net/mlx5: improve single core performance
2017-01-18 7:20 ` Thomas Monjalon
@ 2017-01-18 9:23 ` Nélio Laranjeiro
2017-01-18 10:01 ` [dpdk-dev] [PATCH] net/mlx5: fix 32bits compilation issue Nelio Laranjeiro
1 sibling, 0 replies; 13+ messages in thread
From: Nélio Laranjeiro @ 2017-01-18 9:23 UTC
To: Thomas Monjalon; +Cc: Ferruh Yigit, dev, Adrien Mazarguil
On Wed, Jan 18, 2017 at 08:20:41AM +0100, Thomas Monjalon wrote:
> 2017-01-05 14:13, Ferruh Yigit:
> > On 11/24/2016 4:03 PM, Nelio Laranjeiro wrote:
> > > This series applies on top of
> > > "[PATCH] eal: define generic vector types" [1][2]
> > >
> > > Using built-in vector types forces compilers to consider SIMD instructions in
> > > specific places in order to improve performance on both IBM POWER8 and Intel
> > > architectures.
> > >
> > > For example, testpmd single-thread I/O forwarding packets per second
> > > performance is improved by 6% on Intel platforms.
> > >
> > > [1] http://dpdk.org/ml/archives/dev/2016-November/050261.html
> > > [2] http://dpdk.org/dev/patchwork/patch/17024/
> > >
> > > Nelio Laranjeiro (7):
> > > net/mlx5: prepare Tx vectorization
> > > net/mlx5: use work queue buffer as a raw buffer
> > > net/mlx5: use vector types to speed up processing
> > > net/mlx5: fix missing inline attributes
> > > net/mlx5: move static prototype
> > > net/mlx5: optimize copy of Ethernet header
> > > net/mlx5: remove inefficient prefetching
> >
> > Series applied to dpdk-next-net/master, thanks.
>
> It will not be pulled in mainline because compilation fails on 32-bit:
>
> drivers/net/mlx5/mlx5_rxtx.c: In function ‘mlx5_tx_burst’:
> drivers/net/mlx5/mlx5_rxtx.c:523:10: error:
> right shift count >= width of type [-Werror=shift-count-overflow]
> addr >> 32,
> ^~
>
> Please Ferruh, remove the series from next-net.
Hi Thomas,
Wait, I'll submit a fix in a few minutes.
Regards,
--
Nélio Laranjeiro
6WIND
* [dpdk-dev] [PATCH] net/mlx5: fix 32bits compilation issue
2017-01-18 7:20 ` Thomas Monjalon
2017-01-18 9:23 ` Nélio Laranjeiro
@ 2017-01-18 10:01 ` Nelio Laranjeiro
2017-01-18 10:13 ` Thomas Monjalon
1 sibling, 1 reply; 13+ messages in thread
From: Nelio Laranjeiro @ 2017-01-18 10:01 UTC
To: Thomas Monjalon, dev; +Cc: Adrien Mazarguil, Ferruh Yigit
Fixes: 02bb06aca20f ("net/mlx5: use vector types to speed up processing")
Reported-by: Thomas Monjalon <thomas.monjalon@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
Please squash it into the original patch if possible.
---
drivers/net/mlx5/mlx5_rxtx.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 8cf68c5..0177428 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -390,6 +390,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint32_t length;
unsigned int ds = 0;
uintptr_t addr;
+ uint64_t naddr;
uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE;
uint8_t ehdr[2];
uint8_t cs_flags = 0;
@@ -515,12 +516,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
ds = 3;
use_dseg:
/* Add the remaining packet as a simple ds. */
- addr = htonll(addr);
+ naddr = htonll(addr);
*dseg = (rte_v128u32_t){
htonl(length),
txq_mp2mr(txq, txq_mb2mp(buf)),
- addr,
- addr >> 32,
+ naddr,
+ naddr >> 32,
};
++ds;
if (!segs_n)
@@ -554,12 +555,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
total_length += length;
#endif
/* Store segment information. */
- addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
+ naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
*dseg = (rte_v128u32_t){
htonl(length),
txq_mp2mr(txq, txq_mb2mp(buf)),
- addr,
- addr >> 32,
+ naddr,
+ naddr >> 32,
};
(*txq->elts)[elts_head] = buf;
elts_head = (elts_head + 1) & (elts_n - 1);
--
2.1.4