From: Yongseok Koh <yskoh@mellanox.com>
To: adrien.mazarguil@6wind.com, nelio.laranjeiro@6wind.com,
bruce.richardson@intel.com, konstantin.ananyev@intel.com,
chaozhu@linux.vnet.ibm.com, jerin.jacob@caviumnetworks.com,
jianbo.liu@arm.com
Cc: arybchenko@solarflare.com, dev@dpdk.org,
Yongseok Koh <yskoh@mellanox.com>,
stable@dpdk.org
Subject: [dpdk-dev] [PATCH v4 9/9] net/mlx5: fix synchronization on polling Rx completions
Date: Thu, 25 Jan 2018 13:02:50 -0800 [thread overview]
Message-ID: <20180125210250.38233-10-yskoh@mellanox.com> (raw)
In-Reply-To: <20180125210250.38233-1-yskoh@mellanox.com>
Polling a new packet is basically sensing the generation bit in a
completion entry. For some processors not having strongly-ordered memory
model, there has to be a memory barrier between reading the generation bit
and other fields of the entry in order to guarantee data is not stale.
Fixes: 570acdb1da8a ("net/mlx5: add vectorized Rx/Tx burst for ARM")
Cc: stable@dpdk.org
Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
Acked-by: Shahaf Shuler <shahafs@mellanox.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
drivers/net/mlx5/mlx5_rxtx.c | 1 +
drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 53 ++++++++++++++++++++---------------
drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 2 +-
3 files changed, 32 insertions(+), 24 deletions(-)
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7a24d671d..8e46361d7 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1669,6 +1669,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
return 0;
++rxq->cq_ci;
op_own = cqe->op_own;
+ rte_cio_rmb();
if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
volatile struct mlx5_mini_cqe8 (*mc)[8] =
(volatile struct mlx5_mini_cqe8 (*)[8])
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index e11565f69..29ecedada 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -814,6 +814,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
uint16x4_t mask;
uint16x4_t byte_cnt;
uint32x4_t ptype_info, flow_tag;
+ register uint64x2_t c0, c1, c2, c3;
uint8_t *p0, *p1, *p2, *p3;
uint8_t *e0 = (void *)&elts[pos]->pkt_len;
uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
@@ -830,6 +831,16 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+ /* B.0 (CQE 3) load a block having op_own. */
+ c3 = vld1q_u64((uint64_t *)(p3 + 48));
+ /* B.0 (CQE 2) load a block having op_own. */
+ c2 = vld1q_u64((uint64_t *)(p2 + 48));
+ /* B.0 (CQE 1) load a block having op_own. */
+ c1 = vld1q_u64((uint64_t *)(p1 + 48));
+ /* B.0 (CQE 0) load a block having op_own. */
+ c0 = vld1q_u64((uint64_t *)(p0 + 48));
+ /* Synchronize for loading the rest of blocks. */
+ rte_cio_rmb();
/* Prefetch next 4 CQEs. */
if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
@@ -839,50 +850,46 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
rte_prefetch_non_temporal(&cq[next + 3]);
}
__asm__ volatile (
- /* B.1 (CQE 3) load a block having op_own. */
- "ld1 {v19.16b}, [%[p3]] \n\t"
- "sub %[p3], %[p3], #48 \n\t"
- /* B.2 (CQE 3) load the rest blocks. */
+ /* B.1 (CQE 3) load the rest of blocks. */
"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+ /* B.2 (CQE 3) move the block having op_own. */
+ "mov v19.16b, %[c3].16b \n\t"
/* B.3 (CQE 3) extract 16B fields. */
"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 2) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
/* B.4 (CQE 3) adjust CRC length. */
"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 2) load a block having op_own. */
- "ld1 {v19.16b}, [%[p2]] \n\t"
- "sub %[p2], %[p2], #48 \n\t"
/* C.1 (CQE 3) generate final structure for mbuf. */
"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 2) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+ /* B.2 (CQE 2) move the block having op_own. */
+ "mov v19.16b, %[c2].16b \n\t"
/* B.3 (CQE 2) extract 16B fields. */
"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 1) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
/* B.4 (CQE 2) adjust CRC length. */
"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 1) load a block having op_own. */
- "ld1 {v19.16b}, [%[p1]] \n\t"
- "sub %[p1], %[p1], #48 \n\t"
/* C.1 (CQE 2) generate final structure for mbuf. */
"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 1) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+ /* B.2 (CQE 1) move the block having op_own. */
+ "mov v19.16b, %[c1].16b \n\t"
/* B.3 (CQE 1) extract 16B fields. */
"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 0) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
/* B.4 (CQE 1) adjust CRC length. */
"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
- /* B.1 (CQE 0) load a block having op_own. */
- "ld1 {v19.16b}, [%[p0]] \n\t"
- "sub %[p0], %[p0], #48 \n\t"
/* C.1 (CQE 1) generate final structure for mbuf. */
"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
- /* B.2 (CQE 0) load the rest blocks. */
- "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+ /* B.2 (CQE 0) move the block having op_own. */
+ "mov v19.16b, %[c0].16b \n\t"
+ /* A.1 load mbuf pointers. */
+ "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
/* B.3 (CQE 0) extract 16B fields. */
"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
/* B.4 (CQE 0) adjust CRC length. */
"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
- /* A.1 load mbuf pointers. */
- "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
/* D.1 extract op_own byte. */
"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
/* C.2 (CQE 3) adjust flow mark. */
@@ -917,9 +924,9 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
[byte_cnt]"=&w"(byte_cnt),
[ptype_info]"=&w"(ptype_info),
[flow_tag]"=&w"(flow_tag)
- :[p3]"r"(p3 + 48), [p2]"r"(p2 + 48),
- [p1]"r"(p1 + 48), [p0]"r"(p0 + 48),
+ :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
[e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+ [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
[elts_p]"r"(elts_p),
[pkts_p]"r"(pkts_p),
[cqe_shuf_m]"w"(cqe_shuf_m),
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 559b0237e..df66c2fbd 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -833,7 +833,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
/* B.2 copy mbuf pointers. */
_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
- rte_compiler_barrier();
+ rte_cio_rmb();
/* C.1 load remained CQE data and extract necessary fields. */
cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
--
2.11.0
next prev parent reply other threads:[~2018-01-25 21:03 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-12-27 4:28 [dpdk-dev] [PATCH 1/2] eal/arm64: modify I/O device memory barriers Yongseok Koh
2017-12-27 4:28 ` [dpdk-dev] [PATCH 2/2] net/mlx5: fix synchonization on polling Rx completions Yongseok Koh
2018-01-04 12:58 ` [dpdk-dev] [PATCH 1/2] eal/arm64: modify I/O device memory barriers Jerin Jacob
2018-01-08 1:55 ` Jianbo Liu
2018-01-16 0:42 ` Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 0/8] introduce DMA " Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 1/8] eal: " Yongseok Koh
2018-01-16 2:47 ` Jianbo Liu
2018-01-16 7:49 ` Andrew Rybchenko
2018-01-16 9:10 ` Jianbo Liu
2018-01-17 13:46 ` Thomas Monjalon
2018-01-17 18:39 ` Yongseok Koh
2018-01-18 11:56 ` Andrew Rybchenko
2018-01-18 18:14 ` Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 2/8] eal/x86: define " Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 3/8] eal/ppc64: define DMA device " Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 4/8] eal/armv7: define DMA " Yongseok Koh
2018-01-16 2:48 ` Jianbo Liu
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 5/8] eal/arm64: " Yongseok Koh
2018-01-16 2:50 ` Jianbo Liu
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: remove unnecessary memory barrier Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: replace IO memory barrier with DMA " Yongseok Koh
2018-01-16 1:10 ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: fix synchonization on polling Rx completions Yongseok Koh
2018-01-16 3:53 ` Jianbo Liu
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 0/8] introduce DMA memory barriers Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 1/8] eal: " Yongseok Koh
2018-01-19 7:16 ` Andrew Rybchenko
2018-01-22 18:29 ` Yongseok Koh
2018-01-22 20:59 ` Thomas Monjalon
2018-01-23 4:35 ` Jerin Jacob
2018-01-25 19:08 ` Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 2/8] eal/x86: define " Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 3/8] eal/ppc64: " Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 4/8] eal/armv7: " Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 5/8] eal/arm64: " Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 6/8] net/mlx5: remove unnecessary memory barrier Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 7/8] net/mlx5: replace IO memory barrier with DMA " Yongseok Koh
2018-01-19 0:44 ` [dpdk-dev] [PATCH v3 8/8] net/mlx5: fix synchonization on polling Rx completions Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 0/9] introduce coherent I/O memory barriers Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 1/9] eal: add Doxygen grouping for " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 2/9] eal: introduce coherent I/O " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 3/9] eal/x86: define " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 4/9] eal/ppc64: " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 5/9] eal/armv7: " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 6/9] eal/arm64: " Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 7/9] net/mlx5: remove unnecessary memory barrier Yongseok Koh
2018-01-25 21:02 ` [dpdk-dev] [PATCH v4 8/9] net/mlx5: replace I/O memory barrier with coherent version Yongseok Koh
2018-01-25 21:02 ` Yongseok Koh [this message]
2018-01-28 7:32 ` [dpdk-dev] [PATCH v4 0/9] introduce coherent I/O memory barriers Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180125210250.38233-10-yskoh@mellanox.com \
--to=yskoh@mellanox.com \
--cc=adrien.mazarguil@6wind.com \
--cc=arybchenko@solarflare.com \
--cc=bruce.richardson@intel.com \
--cc=chaozhu@linux.vnet.ibm.com \
--cc=dev@dpdk.org \
--cc=jerin.jacob@caviumnetworks.com \
--cc=jianbo.liu@arm.com \
--cc=konstantin.ananyev@intel.com \
--cc=nelio.laranjeiro@6wind.com \
--cc=stable@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).