From: "Nélio Laranjeiro" <nelio.laranjeiro@6wind.com>
To: Yongseok Koh <yskoh@mellanox.com>
Cc: ferruh.yigit@intel.com, dev@dpdk.org, adrien.mazarguil@6wind.com
Subject: Re: [dpdk-dev] [PATCH 1/2] net/mlx5: add enhanced multi-packet send for ConnectX-5
Date: Wed, 15 Mar 2017 11:09:48 +0100 [thread overview]
Message-ID: <20170315100948.GB22756@autoinstall.dev.6wind.com> (raw)
In-Reply-To: <20170301050225.27164-2-yskoh@mellanox.com>
Hi Yongseok,
Please see the comments below,
On Tue, Feb 28, 2017 at 09:02:24PM -0800, Yongseok Koh wrote:
> ConnectX-5 supports enhanced version of multi-packet send (MPS). An MPS Tx
> descriptor can carry multiple packets either by including pointers of
> packets or by inlining packets. Inlining packet data can be helpful to
> better utilize PCIe bandwidth. In addition, Enhanced MPS supports hybrid
> mode - mixing inlined packets and pointers in a descriptor. This feature is
> enabled by default if supported by HW.
>
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
> drivers/net/mlx5/mlx5.c | 34 +++-
> drivers/net/mlx5/mlx5.h | 4 +-
> drivers/net/mlx5/mlx5_ethdev.c | 6 +-
> drivers/net/mlx5/mlx5_prm.h | 23 +++
> drivers/net/mlx5/mlx5_rxtx.c | 405 +++++++++++++++++++++++++++++++++++++++++
> drivers/net/mlx5/mlx5_rxtx.h | 5 +
> drivers/net/mlx5/mlx5_txq.c | 18 +-
> 7 files changed, 486 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index d4bd4696c..24e3865f0 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -84,6 +84,12 @@
> /* Device parameter to enable multi-packet send WQEs. */
> #define MLX5_TXQ_MPW_EN "txq_mpw_en"
>
> +/* Device parameter to configure the number of dsegs before inlined packet. */
> +#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
> +
> +/* Device parameter to limit the size of inlining packet */
> +#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
> +
> /**
> * Retrieve integer value from environment variable.
> *
> @@ -289,7 +295,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
> } else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
> priv->txqs_inline = tmp;
> } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
> - priv->mps &= !!tmp; /* Enable MPW only if HW supports */
> + priv->mps = !!tmp ? priv->mps : MLX5_MPW_DISABLED;
> + } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
> + priv->mpw_hdr_dseg = !!tmp;
> + } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
> + priv->txq_max_inline_len = tmp;
It is confusing to have a tri-state value for an enable feature; is it
really necessary to distinguish the enhanced mode from the non-enhanced
one? Is there a good reason to let the ConnectX-5 work in the
non-enhanced mode?
> } else {
> WARN("%s: unknown parameter", key);
> return -EINVAL;
> @@ -316,6 +326,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
> MLX5_TXQ_INLINE,
> MLX5_TXQS_MIN_INLINE,
> MLX5_TXQ_MPW_EN,
> + MLX5_TXQ_MPW_HDR_DSEG_EN,
> + MLX5_TXQ_MAX_INLINE_LEN,
> NULL,
> };
> struct rte_kvargs *kvlist;
> @@ -424,20 +436,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
> */
> switch (pci_dev->id.device_id) {
> case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
> + mps = MLX5_MPW;
> + break;
> case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
> case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
> case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> - mps = 1;
> + mps = MLX5_MPW_ENHANCED;
> break;
> default:
> - mps = 0;
> + mps = MLX5_MPW_DISABLED;
> }
> INFO("PCI information matches, using device \"%s\""
> - " (SR-IOV: %s, MPS: %s)",
> + " (SR-IOV: %s, %sMPS: %s)",
> list[i]->name,
> sriov ? "true" : "false",
> - mps ? "true" : "false");
> + mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
> + mps != MLX5_MPW_DISABLED ? "true" : "false");
> attr_ctx = ibv_open_device(list[i]);
> err = errno;
> break;
> @@ -531,6 +546,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
> priv->pd = pd;
> priv->mtu = ETHER_MTU;
> priv->mps = mps; /* Enable MPW by default if supported. */
> + /* Set default values for Enhanced MPW, a.k.a MPWv2 */
> + if (mps == MLX5_MPW_ENHANCED) {
> + priv->mpw_hdr_dseg = 0;
> + priv->txqs_inline = MLX5_EMPW_MIN_TXQS;
> + priv->txq_max_inline_len = MLX5_EMPW_MAX_INLINE_LEN;
> + }
> priv->cqe_comp = 1; /* Enable compression by default. */
> err = mlx5_args(priv, pci_dev->device.devargs);
> if (err) {
> @@ -586,6 +607,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
> err = ENOTSUP;
> goto port_error;
> }
> + INFO("%sMPS is %s",
> + priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
> + priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
> /* Allocate and register default RSS hash keys. */
> priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
> sizeof((*priv->rss_conf)[0]), 0);
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 2b4345a69..4076eb4d5 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -123,11 +123,13 @@ struct priv {
> unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
> unsigned int hw_padding:1; /* End alignment padding is supported. */
> unsigned int sriov:1; /* This is a VF or PF with VF devices. */
> - unsigned int mps:1; /* Whether multi-packet send is supported. */
> + unsigned int mps:2; /* Multi-packet send mode (0: disabled). */
> + unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB */
> unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
> unsigned int pending_alarm:1; /* An alarm is pending. */
> unsigned int txq_inline; /* Maximum packet size for inlining. */
> unsigned int txqs_inline; /* Queue number threshold for inlining. */
> + unsigned int txq_max_inline_len; /* Max packet length for inlining */
txq_max_inline_len seems redundant with txq_inline, whereas it does not
refer to the same thing. Can you please find a better name to help
future maintenance of the code?
> /* RX/TX queues. */
> unsigned int rxqs_n; /* RX queues array size. */
> unsigned int txqs_n; /* TX queues array size. */
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index 5677f03c9..20d3563e4 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1584,7 +1584,11 @@ priv_select_tx_function(struct priv *priv)
> {
> priv->dev->tx_pkt_burst = mlx5_tx_burst;
> /* Select appropriate TX function. */
> - if (priv->mps && priv->txq_inline) {
> + if (priv->mps == MLX5_MPW_ENHANCED) {
> + priv->dev->tx_pkt_burst =
> + mlx5_tx_burst_empw;
> + DEBUG("selected Enhanced MPW TX function");
> + } else if (priv->mps && priv->txq_inline) {
> priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
> DEBUG("selected MPW inline TX function");
> } else if (priv->mps) {
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index 755b5d779..139a54f0d 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -73,6 +73,9 @@
> /* WQE size */
> #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
>
> +/* Max size of a WQE session */
> +#define MLX5_WQE_SIZE_MAX 960U
> +
> /* Compute the number of DS. */
> #define MLX5_WQE_DS(n) \
> (((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
> @@ -80,10 +83,22 @@
> /* Room for inline data in multi-packet WQE. */
> #define MLX5_MWQE64_INL_DATA 28
>
> +/* Ratio(1/N) of inline quota in a Enhanced MPS WQE */
> +#define MLX5_EMPW_INL_QUOTA_DIV 1
> +
> +/* Default number of Tx queues for inlining packets */
> +#define MLX5_EMPW_MIN_TXQS 12
> +
> +/* Default max packet length to be inlined */
> +#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
> +
> #ifndef HAVE_VERBS_MLX5_OPCODE_TSO
> #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */
> #endif
>
> +#define MLX5_OPC_MOD_ENHANCED_MPSW 0
> +#define MLX5_OPCODE_ENHANCED_MPSW 0x29
> +
> /* CQE value to inform that VLAN is stripped. */
> #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
>
> @@ -170,10 +185,18 @@ struct mlx5_wqe64 {
> uint8_t raw[32];
> } __rte_aligned(MLX5_WQE_SIZE);
>
> +/* MPW mode */
> +enum mlx5_mpw_mode {
> + MLX5_MPW_DISABLED,
> + MLX5_MPW,
> + MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2 */
> +};
> +
> /* MPW session status. */
> enum mlx5_mpw_state {
> MLX5_MPW_STATE_OPENED,
> MLX5_MPW_INL_STATE_OPENED,
> + MLX5_MPW_ENHANCED_STATE_OPENED,
> MLX5_MPW_STATE_CLOSED,
> };
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index b2b722380..9fc3f5016 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
> }
>
> /**
> + * Return the size of tailroom of WQ.
> + *
> + * @param txq
> + * Pointer to TX queue structure.
> + * @param addr
> + * Pointer to tail of WQ.
> + *
> + * @return
> + * Size of tailroom.
> + */
> +static inline size_t
> +tx_mlx5_wqe_tailroom(struct txq *txq, void *addr)
> +{
> + size_t tailroom;
> + tailroom = (uintptr_t)(txq->wqes) +
> + (1 << txq->wqe_n) * MLX5_WQE_SIZE -
> + (uintptr_t)addr;
> + return tailroom;
> +}
The name of the function is confusing, as it does not compute the
tailroom of a Work Queue Element but the remaining size of the Work
Queue in bytes.
Isn't the __attribute__((always_inline)) missing?
> +/**
> + * Copy data to tailroom of circular queue.
> + *
> + * @param dst
> + * Pointer to destination.
> + * @param src
> + * Pointer to source.
> + * @param n
> + * Number of bytes to copy.
> + * @param base
> + * Pointer to head of queue.
> + * @param tailroom
> + * Size of tailroom from dst.
> + *
> + * @return
> + * Pointer after copied data.
> + */
> +static inline void *
> +memcpy_to_tailroom(void *dst, const void *src, size_t n,
> + void *base, size_t tailroom)
This one is also confusing: "tailroom" is generally used for the
remaining space after the packet in the buffer. Can you please choose a
more specific name for this function? Maybe something like
mlx5_copy_wq_tailroom().
By the way, the indentation is not correct.
> +{
> + void *ret;
> +
> + if (n > tailroom) {
> + rte_memcpy(dst, src, tailroom);
> + rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
> + n - tailroom);
The indentation here is also incorrect.
> + ret = (uint8_t *)base + n - tailroom;
> + } else {
> + rte_memcpy(dst, src, n);
> + ret = (n == tailroom) ? base : (uint8_t *)dst + n;
> + }
> + return ret;
> +}
> +
> +/**
> * Manage TX completions.
> *
> * When sending a burst, mlx5_tx_burst() posts several WRs.
> @@ -1155,6 +1211,355 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
> }
>
> /**
> + * Open an Enhanced MPW session.
> + *
> + * @param txq
> + * Pointer to TX queue structure.
> + * @param mpw
> + * Pointer to MPW session structure.
> + * @param length
> + * Packet length.
> + */
> +static inline void
> +mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
> +{
> + uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
> +
> + mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
> + mpw->pkts_n = 0;
> + mpw->total_len = sizeof(struct mlx5_wqe);
> + mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
> + mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
> + (txq->wqe_ci << 8) |
> + MLX5_OPCODE_ENHANCED_MPSW);
> + mpw->wqe->ctrl[2] = 0;
> + mpw->wqe->ctrl[3] = 0;
> + memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
> + if (unlikely(padding)) {
> + uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
> +
> + /* Pad the first 2 DWORDs with zero-length inline header */
> + *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
> + *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE)
> + = htonl(MLX5_INLINE_SEG);
> + mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
> + /* Start from the next WQEBB */
> + mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
> + } else {
> + mpw->data.raw = (volatile void *)(mpw->wqe + 1);
> + }
> +}
> +
> +/**
> + * Close an Enhanced MPW session.
> + *
> + * @param txq
> + * Pointer to TX queue structure.
> + * @param mpw
> + * Pointer to MPW session structure.
> + *
> + * @return
> + * Number of consumed WQEs.
> + */
> +static inline uint16_t
> +mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
> +{
> + uint16_t ret;
> +
> + /* Store size in multiple of 16 bytes. Control and Ethernet segments
> + * count as 2.
> + */
> + mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
> + mpw->state = MLX5_MPW_STATE_CLOSED;
> + ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
> + txq->wqe_ci += ret;
> + return ret;
> +}
> +
> +/**
> + * DPDK callback for TX with Enhanced MPW support.
> + *
> + * @param dpdk_txq
> + * Generic pointer to TX queue structure.
> + * @param[in] pkts
> + * Packets to transmit.
> + * @param pkts_n
> + * Number of packets in array.
> + *
> + * @return
> + * Number of packets successfully transmitted (<= pkts_n).
> + */
> +uint16_t
> +mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> +{
> + struct txq *txq = (struct txq *)dpdk_txq;
> + uint16_t elts_head = txq->elts_head;
> + const unsigned int elts_n = 1 << txq->elts_n;
> + unsigned int i = 0;
> + unsigned int j = 0;
> + unsigned int max_elts;
> + unsigned int mpw_n = 0; /* the number of outstanding WQEs. */
> + uint16_t max_wqe, max_cqe;
> + unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
> + unsigned int mpw_room = 0;
> + unsigned int inl_pad = 0;
> + unsigned int inl_budget = 0;
> + uint32_t inl_hdr;
> + struct mlx5_mpw mpw = {
> + .state = MLX5_MPW_STATE_CLOSED,
> + };
> +
> + if (unlikely(!pkts_n))
> + return 0;
> + /* Start processing. */
> + txq_complete(txq);
> + max_elts = (elts_n - (elts_head - txq->elts_tail));
> + if (max_elts > elts_n)
> + max_elts -= elts_n;
> + max_cqe = (1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci);
> + /* One CQE slot is needed at the end */
> + if (!max_cqe)
> + return 0;
> + max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
> + if (unlikely(!max_wqe))
> + return 0;
> + do {
> + struct rte_mbuf *buf = *(pkts++);
> + unsigned int elts_head_next;
> + uintptr_t addr;
> + unsigned int n;
> + uint32_t length;
> + unsigned int segs_n = buf->nb_segs;
> + uint32_t cs_flags = 0;
> +
> + /*
> + * Make sure there is enough room to store this packet and
> + * that one ring entry remains unused.
> + */
> + assert(segs_n);
> + if (max_elts - j < segs_n + 1)
> + break;
> + /* Do not bother with large packets MPW cannot handle. */
> + if (segs_n > MLX5_MPW_DSEG_MAX)
> + break;
> + /* Should we enable HW CKSUM offload */
> + if (buf->ol_flags &
> + (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
> + cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
> + /* Retrieve packet information. */
> + length = PKT_LEN(buf);
> + /* Start new session if:
> + * - multi-segment packet
> + * - no space left even for a dseg
> + * - next packet can be inlined with a new WQE
> + * - cs_flag differs
> + * It can't be MLX5_MPW_STATE_OPENED as always have a single
> + * segmented packet.
> + */
> + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
> + if ((segs_n != 1) ||
> + (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
> + mpw_room) ||
> + (length <= txq->max_inline_len &&
> + (length > inl_budget ||
> + inl_pad + sizeof(inl_hdr) + length >
> + mpw_room)) ||
> + (mpw.wqe->eseg.cs_flags != cs_flags))
> + max_wqe -= mlx5_empw_close(txq, &mpw);
As there are already comments about names, I would also suggest fixing
the indentation in the if statement; mpw_room is not aligned correctly
in either place it appears.
> + }
> + if (mpw.state == MLX5_MPW_STATE_CLOSED) {
> + if (segs_n != 1) {
> + /* Fall back to legacy MPW.
> + * A MPW session consumes 2 WQEs at most to
> + * include MLX5_MPW_DSEG_MAX pointers.
> + */
> + if (unlikely(max_wqe < 2))
> + break;
> + mlx5_mpw_new(txq, &mpw, length);
> + } else {
> + /* In Enhanced MPW, inline as much as the budget
> + * is allowed. The remaining space is to be
> + * filled with dsegs. If the title WQEBB isn't
> + * padded, it will have 2 dsegs there.
> + */
> + mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
> + (max_inline ? max_inline :
> + pkts_n * MLX5_WQE_DWORD_SIZE) +
> + MLX5_WQE_SIZE);
> + if (unlikely((max_wqe * MLX5_WQE_SIZE) <
> + mpw_room))
Same here.
> + break;
> + /* Do not pad the title WQEBB to not waste WQ */
> + mlx5_empw_new(txq, &mpw, 0);
> + mpw_room -= mpw.total_len;
> + inl_budget = max_inline ?
> + mpw_room / MLX5_EMPW_INL_QUOTA_DIV :
> + 0;
> + inl_pad = 0;
> + }
> + mpw.wqe->eseg.cs_flags = cs_flags;
> + ++mpw_n;
> + }
> + /* Multi-segment packets must be alone in their MPW. */
> + assert((segs_n == 1) || (mpw.pkts_n == 0));
> + if (mpw.state == MLX5_MPW_STATE_OPENED) {
> +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
> + length = 0;
> +#endif
> + do {
> + volatile struct mlx5_wqe_data_seg *dseg;
> +
> + elts_head_next =
> + (elts_head + 1) & (elts_n - 1);
> + assert(buf);
> + (*txq->elts)[elts_head] = buf;
> + dseg = mpw.data.dseg[mpw.pkts_n];
> + addr = rte_pktmbuf_mtod(buf, uintptr_t);
> + *dseg = (struct mlx5_wqe_data_seg){
> + .byte_count = htonl(DATA_LEN(buf)),
> + .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
> + .addr = htonll(addr),
> + };
> + elts_head = elts_head_next;
> +#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
> + length += DATA_LEN(buf);
> +#endif
> + buf = buf->next;
> + ++j;
> + ++mpw.pkts_n;
> + } while (--segs_n);
> + /* A multi-segmented packet takes one MPW session.
> + * TODO: Pack more multi-segmented packets if possible.
> + */
> + mlx5_mpw_close(txq, &mpw);
> + if (mpw.pkts_n < 3)
> + max_wqe--;
> + else
> + max_wqe -= 2;
> + } else if (length <= txq->max_inline_len &&
> + length <= inl_budget &&
> + (inl_pad + sizeof(inl_hdr) + length) <= mpw_room &&
> + (!txq->mpw_hdr_dseg ||
> + mpw.total_len >= MLX5_WQE_SIZE)) {
> + /* Inline packet into WQE */
> + unsigned int max;
> +
> + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
> + assert(length == DATA_LEN(buf));
> + inl_hdr = htonl(length | MLX5_INLINE_SEG);
> + addr = rte_pktmbuf_mtod(buf, uintptr_t);
> + mpw.data.raw = (volatile void *)
> + ((uintptr_t)mpw.data.raw + inl_pad);
> + max = tx_mlx5_wqe_tailroom(txq,
> + (void *)(uintptr_t)mpw.data.raw);
> + /* Copy inline header */
> + mpw.data.raw = (volatile void *)
> + memcpy_to_tailroom(
> + (void *)(uintptr_t)mpw.data.raw,
> + &inl_hdr,
> + sizeof(inl_hdr),
> + (void *)(uintptr_t)txq->wqes,
> + max);
> + max = tx_mlx5_wqe_tailroom(txq,
> + (void *)(uintptr_t)mpw.data.raw);
> + /* Copy packet data */
> + mpw.data.raw = (volatile void *)
> + memcpy_to_tailroom(
> + (void *)(uintptr_t)mpw.data.raw,
> + (void *)addr,
> + length,
> + (void *)(uintptr_t)txq->wqes,
> + max);
> + ++mpw.pkts_n;
> + inl_budget -= length;
> + mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
> + /* No need to get completion as the entire packet is
> + * copied to WQ. Free the buf right away.
> + */
> + elts_head_next = elts_head;
> + rte_pktmbuf_free_seg(buf);
> + mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
> + /* Add pad in the next packet if any */
> + inl_pad = (((uintptr_t)mpw.data.raw +
> + (MLX5_WQE_DWORD_SIZE - 1)) &
> + ~(MLX5_WQE_DWORD_SIZE - 1)) -
> + (uintptr_t)mpw.data.raw;
> + } else {
> + /* No inline. Load a dseg of packet pointer */
> + volatile rte_v128u32_t *dseg;
> +
> + assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
> + assert((inl_pad + sizeof(*dseg)) <= mpw_room);
> + assert(length == DATA_LEN(buf));
> + if (!tx_mlx5_wqe_tailroom(txq,
> + (void *)((uintptr_t)mpw.data.raw
> + + inl_pad)))
> + dseg = (volatile void *)txq->wqes;
> + else
> + dseg = (volatile void *)
> + ((uintptr_t)mpw.data.raw +
> + inl_pad);
> + elts_head_next = (elts_head + 1) & (elts_n - 1);
> + (*txq->elts)[elts_head] = buf;
> + addr = rte_pktmbuf_mtod(buf, uintptr_t);
> + for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
> + rte_prefetch2((void *)(addr +
> + n * RTE_CACHE_LINE_SIZE));
> + addr = htonll(addr);
> + *dseg = (rte_v128u32_t) {
> + htonl(length),
> + txq_mp2mr(txq, txq_mb2mp(buf)),
> + addr,
> + addr >> 32,
> + };
> + mpw.data.raw = (volatile void *)(dseg + 1);
> + mpw.total_len += (inl_pad + sizeof(*dseg));
> + ++j;
> + ++mpw.pkts_n;
> + mpw_room -= (inl_pad + sizeof(*dseg));
> + inl_pad = 0;
> + }
> + elts_head = elts_head_next;
> +#ifdef MLX5_PMD_SOFT_COUNTERS
> + /* Increment sent bytes counter. */
> + txq->stats.obytes += length;
> +#endif
> + ++i;
> + } while (i < pkts_n);
> + /* Take a shortcut if nothing must be sent. */
> + if (unlikely(i == 0))
> + return 0;
> + /* Check whether completion threshold has been reached. */
> + if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
> + txq->mpw_comp + mpw_n >= MLX5_TX_COMP_THRESH) {
> + volatile struct mlx5_wqe *wqe = mpw.wqe;
> +
> + /* Request completion on last WQE. */
> + wqe->ctrl[2] = htonl(8);
> + /* Save elts_head in unused "immediate" field of WQE. */
> + wqe->ctrl[3] = elts_head;
> + txq->elts_comp = 0;
> + txq->mpw_comp = 0;
> + txq->cq_pi++;
> + } else {
> + txq->elts_comp += j;
> + txq->mpw_comp += mpw_n;
> + }
> +#ifdef MLX5_PMD_SOFT_COUNTERS
> + /* Increment sent packets counter. */
> + txq->stats.opackets += i;
> +#endif
> + if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
> + mlx5_empw_close(txq, &mpw);
> + else if (mpw.state == MLX5_MPW_STATE_OPENED)
> + mlx5_mpw_close(txq, &mpw);
> + /* Ring QP doorbell. */
> + mlx5_tx_dbrec(txq, mpw.wqe);
> + txq->elts_head = elts_head;
> + return i;
> +}
> +
> +/**
> * Translate RX completion flags to packet type.
> *
> * @param[in] cqe
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 41a34d7ff..e70d65465 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -247,13 +247,17 @@ struct txq {
> uint16_t elts_head; /* Current index in (*elts)[]. */
> uint16_t elts_tail; /* First element awaiting completion. */
> uint16_t elts_comp; /* Counter since last completion request. */
> + uint16_t mpw_comp; /* Counter of MPW since last completion request. */
> uint16_t cq_ci; /* Consumer index for completion queue. */
> + uint16_t cq_pi; /* Producer index for completion queue. */
> uint16_t wqe_ci; /* Consumer index for work queue. */
> uint16_t wqe_pi; /* Producer index for work queue. */
> uint16_t elts_n:4; /* (*elts)[] length (in log2). */
> uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
> uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
> + uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB */
> uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
> + uint16_t max_inline_len; /* Max packet length to inilne */
Small typo here (i.e. "inilne" instead of "inline").
max_inline_len is confusing and seems redundant with max_inline, but
they do not define the same thing. Please find an appropriate name to
help future maintenance.
> uint32_t qp_num_8s; /* QP number shifted by 8. */
> volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
> volatile void *wqes; /* Work queue (use volatile to write into). */
> @@ -320,6 +324,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
> uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
> uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
> uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
> +uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t);
> uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
> uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
> uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 949035bd4..ef8775382 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -276,6 +276,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
> (void)conf; /* Thresholds configuration (ignored). */
> assert(desc > MLX5_TX_COMP_THRESH);
> tmpl.txq.elts_n = log2above(desc);
> + if (priv->mps == MLX5_MPW_ENHANCED)
> + tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
> /* MRs will be registered in mp2mr[] later. */
> attr.rd = (struct ibv_exp_res_domain_init_attr){
> .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
> @@ -340,8 +342,20 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
> tmpl.txq.max_inline =
> ((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
> RTE_CACHE_LINE_SIZE);
> - attr.init.cap.max_inline_data =
> - tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
> + if (priv->mps == MLX5_MPW_ENHANCED) {
> + tmpl.txq.max_inline_len = priv->txq_max_inline_len;
> + /* To minimize the size of data set, avoid requesting
> + * too large WQ
> + */
> + attr.init.cap.max_inline_data =
> + ((RTE_MIN(priv->txq_inline,
> + priv->txq_max_inline_len) +
> + (RTE_CACHE_LINE_SIZE - 1)) /
> + RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
> + } else {
> + attr.init.cap.max_inline_data =
> + tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
> + }
> }
> tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
> if (tmpl.qp == NULL) {
> --
> 2.11.0
Great job,
Thanks,
--
Nélio Laranjeiro
6WIND
next prev parent reply other threads:[~2017-03-15 10:09 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-03-01 5:02 [dpdk-dev] [PATCH 0/2] " Yongseok Koh
2017-03-01 5:02 ` [dpdk-dev] [PATCH 1/2] " Yongseok Koh
2017-03-15 10:09 ` Nélio Laranjeiro [this message]
2017-03-01 5:02 ` [dpdk-dev] [PATCH 2/2] doc: update PMD options for mlx5 Yongseok Koh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170315100948.GB22756@autoinstall.dev.6wind.com \
--to=nelio.laranjeiro@6wind.com \
--cc=adrien.mazarguil@6wind.com \
--cc=dev@dpdk.org \
--cc=ferruh.yigit@intel.com \
--cc=yskoh@mellanox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).