DPDK patches and discussions
* [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing
@ 2020-07-10  9:48 Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 02/16] net/mlx5: introduce send scheduling devargs Viacheslav Ovsiienko
                   ` (14 more replies)
  0 siblings, 15 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This patch prepares the common part of the mlx5 PMDs to
support packet send scheduling on mbuf timestamps:

  - the DevX routine to query the packet pacing HCA capabilities
  - packet pacing Send Queue attributes support
  - the hardware-related definitions
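
Below is a minimal sketch (not part of the patch) of how a caller on
the PMD side is expected to consume the new capability bits returned
by mlx5_devx_cmd_query_hca_attr(); "ctx" stands for the DevX-capable
device context and the error handling is illustrative only:

    struct mlx5_hca_attr attr = { 0 };

    if (mlx5_devx_cmd_query_hca_attr(ctx, &attr))
            return -rte_errno;
    if (!attr.qos.packet_pacing || !attr.non_wire_sq)
            return -ENOTSUP; /* No send scheduling on this device. */
    /* dev_freq_khz reports the timestamp counter frequency in kHz. */
    DRV_LOG(DEBUG, "device frequency: %u kHz", attr.dev_freq_khz);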

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---

RFC:  http://patches.dpdk.org/patch/71078/
mbuf: http://patches.dpdk.org/patch/73643/

 drivers/common/mlx5/Makefile          | 20 ++++++++++
 drivers/common/mlx5/linux/meson.build |  8 ++++
 drivers/common/mlx5/linux/mlx5_glue.c | 31 +++++++++++++++-
 drivers/common/mlx5/linux/mlx5_glue.h |  5 +++
 drivers/common/mlx5/mlx5_devx_cmds.c  | 19 +++++++++-
 drivers/common/mlx5/mlx5_devx_cmds.h  | 10 +++++
 drivers/common/mlx5/mlx5_prm.h        | 69 ++++++++++++++++++++++++++++++++---
 7 files changed, 154 insertions(+), 8 deletions(-)

diff --git a/drivers/common/mlx5/Makefile b/drivers/common/mlx5/Makefile
index f6c762b..de03a40 100644
--- a/drivers/common/mlx5/Makefile
+++ b/drivers/common/mlx5/Makefile
@@ -172,6 +172,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		func mlx5dv_devx_qp_query \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_MLX5DV_PP_ALLOC \
+		infiniband/mlx5dv.h \
+		func mlx5dv_pp_alloc \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR \
 		infiniband/mlx5dv.h \
 		func mlx5dv_dr_action_create_dest_devx_tir \
@@ -207,6 +212,21 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		func mlx5dv_dr_domain_set_reclaim_device_memory \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_MLX5_OPCODE_ENHANCED_MPSW \
+		infiniband/mlx5dv.h \
+		enum MLX5_OPCODE_ENHANCED_MPSW \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_MLX5_OPCODE_SEND_EN \
+		infiniband/mlx5dv.h \
+		enum MLX5_OPCODE_SEND_EN \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_MLX5_OPCODE_WAIT \
+		infiniband/mlx5dv.h \
+		enum MLX5_OPCODE_WAIT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_ETHTOOL_LINK_MODE_25G \
 		/usr/include/linux/ethtool.h \
 		enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \
diff --git a/drivers/common/mlx5/linux/meson.build b/drivers/common/mlx5/linux/meson.build
index 2294213..6116b5e 100644
--- a/drivers/common/mlx5/linux/meson.build
+++ b/drivers/common/mlx5/linux/meson.build
@@ -101,6 +101,8 @@ has_sym_args = [
 	'mlx5dv_devx_obj_query_async' ],
 	[ 'HAVE_IBV_DEVX_QP', 'infiniband/mlx5dv.h',
 	'mlx5dv_devx_qp_query' ],
+	[ 'HAVE_MLX5DV_PP_ALLOC', 'infiniband/mlx5dv.h',
+	'mlx5dv_pp_alloc' ],
 	[ 'HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR', 'infiniband/mlx5dv.h',
 	'mlx5dv_dr_action_create_dest_devx_tir' ],
 	[ 'HAVE_IBV_DEVX_EVENT', 'infiniband/mlx5dv.h',
@@ -116,6 +118,12 @@ has_sym_args = [
 	[ 'HAVE_MLX5DV_DR_VLAN', 'infiniband/mlx5dv.h',
 	'mlx5dv_dr_action_create_push_vlan' ],
 	[ 'HAVE_IBV_VAR', 'infiniband/mlx5dv.h', 'mlx5dv_alloc_var' ],
+	[ 'HAVE_MLX5_OPCODE_ENHANCED_MPSW', 'infiniband/mlx5dv.h',
+	'MLX5_OPCODE_ENHANCED_MPSW' ],
+	[ 'HAVE_MLX5_OPCODE_SEND_EN', 'infiniband/mlx5dv.h',
+	'MLX5_OPCODE_SEND_EN' ],
+	[ 'HAVE_MLX5_OPCODE_WAIT', 'infiniband/mlx5dv.h',
+	'MLX5_OPCODE_WAIT' ],
 	[ 'HAVE_SUPPORTED_40000baseKR4_Full', 'linux/ethtool.h',
 	'SUPPORTED_40000baseKR4_Full' ],
 	[ 'HAVE_SUPPORTED_40000baseCR4_Full', 'linux/ethtool.h',
diff --git a/drivers/common/mlx5/linux/mlx5_glue.c b/drivers/common/mlx5/linux/mlx5_glue.c
index 395519d..b61a28b 100644
--- a/drivers/common/mlx5/linux/mlx5_glue.c
+++ b/drivers/common/mlx5/linux/mlx5_glue.c
@@ -1195,7 +1195,6 @@
 #endif
 }
 
-
 static void
 mlx5_glue_dr_reclaim_domain_memory(void *domain, uint32_t enable)
 {
@@ -1207,6 +1206,34 @@
 #endif
 }
 
+static struct mlx5dv_pp *
+mlx5_glue_dv_alloc_pp(struct ibv_context *context,
+		      size_t pp_context_sz,
+		      const void *pp_context,
+		      uint32_t flags)
+{
+#ifdef HAVE_MLX5DV_PP_ALLOC
+	return mlx5dv_pp_alloc(context, pp_context_sz, pp_context, flags);
+#else
+	RTE_SET_USED(context);
+	RTE_SET_USED(pp_context_sz);
+	RTE_SET_USED(pp_context);
+	RTE_SET_USED(flags);
+	errno = ENOTSUP;
+	return NULL;
+#endif
+}
+
+static void
+mlx5_glue_dv_free_pp(struct mlx5dv_pp *pp)
+{
+#ifdef HAVE_MLX5DV_PP_ALLOC
+	return mlx5dv_pp_free(pp);
+#else
+	RTE_SET_USED(pp);
+#endif
+}
+
 __rte_cache_aligned
 const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue) {
 	.version = MLX5_GLUE_VERSION,
@@ -1319,4 +1346,6 @@
 	.devx_free_uar = mlx5_glue_devx_free_uar,
 	.dv_alloc_var = mlx5_glue_dv_alloc_var,
 	.dv_free_var = mlx5_glue_dv_free_var,
+	.dv_alloc_pp = mlx5_glue_dv_alloc_pp,
+	.dv_free_pp = mlx5_glue_dv_free_pp,
 };
diff --git a/drivers/common/mlx5/linux/mlx5_glue.h b/drivers/common/mlx5/linux/mlx5_glue.h
index 069d854..c4f9b00 100644
--- a/drivers/common/mlx5/linux/mlx5_glue.h
+++ b/drivers/common/mlx5/linux/mlx5_glue.h
@@ -304,6 +304,11 @@ struct mlx5_glue {
 			 struct mlx5dv_devx_async_event_hdr *event_data,
 			 size_t event_resp_len);
 	void (*dr_reclaim_domain_memory)(void *domain, uint32_t enable);
+	struct mlx5dv_pp *(*dv_alloc_pp)(struct ibv_context *context,
+					 size_t pp_context_sz,
+					 const void *pp_context,
+					 uint32_t flags);
+	void (*dv_free_pp)(struct mlx5dv_pp *pp);
 };
 
 extern const struct mlx5_glue *mlx5_glue;
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 2179a83..093636c 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -467,6 +467,14 @@ struct mlx5_devx_obj *
 	attr->vdpa.queue_counters_valid = !!(MLX5_GET64(cmd_hca_cap, hcattr,
 							general_obj_types) &
 				  MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_Q_COUNTERS);
+	attr->wqe_index_ignore = MLX5_GET(cmd_hca_cap, hcattr,
+					  wqe_index_ignore_cap);
+	attr->cross_channel = MLX5_GET(cmd_hca_cap, hcattr, cd);
+	attr->non_wire_sq = MLX5_GET(cmd_hca_cap, hcattr, non_wire_sq);
+	attr->log_max_static_sq_wq = MLX5_GET(cmd_hca_cap, hcattr,
+					      log_max_static_sq_wq);
+	attr->dev_freq_khz = MLX5_GET(cmd_hca_cap, hcattr,
+				      device_frequency_khz);
 	if (attr->qos.sup) {
 		MLX5_SET(query_hca_cap_in, in, op_mod,
 			 MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP |
@@ -487,9 +495,13 @@ struct mlx5_devx_obj *
 		attr->qos.log_max_flow_meter =
 				MLX5_GET(qos_cap, hcattr, log_max_flow_meter);
 		attr->qos.flow_meter_reg_c_ids =
-			MLX5_GET(qos_cap, hcattr, flow_meter_reg_id);
+				MLX5_GET(qos_cap, hcattr, flow_meter_reg_id);
 		attr->qos.flow_meter_reg_share =
-			MLX5_GET(qos_cap, hcattr, flow_meter_reg_share);
+				MLX5_GET(qos_cap, hcattr, flow_meter_reg_share);
+		attr->qos.packet_pacing =
+				MLX5_GET(qos_cap, hcattr, packet_pacing);
+		attr->qos.wqe_rate_pp =
+				MLX5_GET(qos_cap, hcattr, wqe_rate_pp);
 	}
 	if (attr->vdpa.valid)
 		mlx5_devx_cmd_query_hca_vdpa_attr(ctx, &attr->vdpa);
@@ -971,6 +983,8 @@ struct mlx5_devx_obj *
 	MLX5_SET(sqc, sq_ctx, reg_umr, sq_attr->reg_umr);
 	MLX5_SET(sqc, sq_ctx, allow_swp, sq_attr->allow_swp);
 	MLX5_SET(sqc, sq_ctx, hairpin, sq_attr->hairpin);
+	MLX5_SET(sqc, sq_ctx, non_wire, sq_attr->non_wire);
+	MLX5_SET(sqc, sq_ctx, static_sq_wq, sq_attr->static_sq_wq);
 	MLX5_SET(sqc, sq_ctx, user_index, sq_attr->user_index);
 	MLX5_SET(sqc, sq_ctx, cqn, sq_attr->cqn);
 	MLX5_SET(sqc, sq_ctx, packet_pacing_rate_limit_index,
@@ -1185,6 +1199,7 @@ struct mlx5_devx_obj *
 	} else {
 		MLX5_SET64(cqc, cqctx, dbr_addr, attr->db_addr);
 	}
+	MLX5_SET(cqc, cqctx, cqe_sz, attr->cqe_size);
 	MLX5_SET(cqc, cqctx, cc, attr->use_first_only);
 	MLX5_SET(cqc, cqctx, oi, attr->overrun_ignore);
 	MLX5_SET(cqc, cqctx, log_cq_size, attr->log_cq_size);
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 25704ef..c79b349 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -29,6 +29,8 @@ struct mlx5_devx_mkey_attr {
 struct mlx5_hca_qos_attr {
 	uint32_t sup:1;	/* Whether QOS is supported. */
 	uint32_t srtcm_sup:1; /* Whether srTCM mode is supported. */
+	uint32_t packet_pacing:1; /* Packet pacing is supported. */
+	uint32_t wqe_rate_pp:1; /* Packet pacing WQE rate mode. */
 	uint32_t flow_meter_reg_share:1;
 	/* Whether reg_c share is supported. */
 	uint8_t log_max_flow_meter;
@@ -90,6 +92,11 @@ struct mlx5_hca_attr {
 	uint32_t vhca_id:16;
 	uint32_t relaxed_ordering_write:1;
 	uint32_t relaxed_ordering_read:1;
+	uint32_t wqe_index_ignore:1;
+	uint32_t cross_channel:1;
+	uint32_t non_wire_sq:1; /* SQ with non-wire ops is supported. */
+	uint32_t log_max_static_sq_wq:5; /* Static WQE size SQ. */
+	uint32_t dev_freq_khz; /* Timestamp counter frequency, kHz. */
 	struct mlx5_hca_qos_attr qos;
 	struct mlx5_hca_vdpa_attr vdpa;
 };
@@ -207,6 +214,8 @@ struct mlx5_devx_create_sq_attr {
 	uint32_t reg_umr:1;
 	uint32_t allow_swp:1;
 	uint32_t hairpin:1;
+	uint32_t non_wire:1;
+	uint32_t static_sq_wq:1;
 	uint32_t user_index:24;
 	uint32_t cqn:24;
 	uint32_t packet_pacing_rate_limit_index:16;
@@ -230,6 +239,7 @@ struct mlx5_devx_cq_attr {
 	uint32_t db_umem_valid:1;
 	uint32_t use_first_only:1;
 	uint32_t overrun_ignore:1;
+	uint32_t cqe_size:3;
 	uint32_t log_cq_size:5;
 	uint32_t log_page_size:5;
 	uint32_t uar_page_id;
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index c63795f..8705b42 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -41,6 +41,10 @@
 /* Invalidate a CQE. */
 #define MLX5_CQE_INVALIDATE (MLX5_CQE_INVALID << 4)
 
+/* Hardware index widths. */
+#define MLX5_CQ_INDEX_WIDTH 24
+#define MLX5_WQ_INDEX_WIDTH 16
+
 /* WQE Segment sizes in bytes. */
 #define MLX5_WSEG_SIZE 16u
 #define MLX5_WQE_CSEG_SIZE sizeof(struct mlx5_wqe_cseg)
@@ -126,7 +130,17 @@
 				  MLX5_ESEG_MIN_INLINE_SIZE)
 
 /* Missed in mlv5dv.h, should define here. */
+#ifndef HAVE_MLX5_OPCODE_ENHANCED_MPSW
 #define MLX5_OPCODE_ENHANCED_MPSW 0x29u
+#endif
+
+#ifndef HAVE_MLX5_OPCODE_SEND_EN
+#define MLX5_OPCODE_SEND_EN 0x17u
+#endif
+
+#ifndef HAVE_MLX5_OPCODE_WAIT
+#define MLX5_OPCODE_WAIT 0x0fu
+#endif
 
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
@@ -255,6 +269,9 @@
 /* The alignment needed for WQ buffer. */
 #define MLX5_WQE_BUF_ALIGNMENT sysconf(_SC_PAGESIZE)
 
+/* The alignment needed for CQ buffer. */
+#define MLX5_CQE_BUF_ALIGNMENT sysconf(_SC_PAGESIZE)
+
 /* Completion mode. */
 enum mlx5_completion_mode {
 	MLX5_COMP_ONLY_ERR = 0x0,
@@ -314,6 +331,13 @@ struct mlx5_wqe_eseg {
 	};
 } __rte_packed;
 
+struct mlx5_wqe_qseg {
+	uint32_t reserved0;
+	uint32_t reserved1;
+	uint32_t max_index;
+	uint32_t qpn_cqn;
+} __rte_packed;
+
 /* The title WQEBB, header of WQE. */
 struct mlx5_wqe {
 	union {
@@ -373,6 +397,14 @@ struct mlx5_cqe {
 	uint8_t op_own;
 };
 
+struct mlx5_cqe_ts {
+	uint64_t timestamp;
+	uint32_t sop_drop_qpn;
+	uint16_t wqe_counter;
+	uint8_t rsvd5;
+	uint8_t op_own;
+};
+
 /* Adding direct verbs to data-path. */
 
 /* CQ sequence number mask. */
@@ -992,7 +1024,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 reserved_at_40[0x40];
 	u8 log_max_srq_sz[0x8];
 	u8 log_max_qp_sz[0x8];
-	u8 reserved_at_90[0xb];
+	u8 reserved_at_90[0x9];
+	u8 wqe_index_ignore_cap[0x1];
+	u8 dynamic_qp_allocation[0x1];
 	u8 log_max_qp[0x5];
 	u8 reserved_at_a0[0xb];
 	u8 log_max_srq[0x5];
@@ -1018,9 +1052,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8 umr_extended_translation_offset[0x1];
 	u8 null_mkey[0x1];
 	u8 log_max_klm_list_size[0x6];
-	u8 reserved_at_120[0xa];
+	u8 non_wire_sq[0x1];
+	u8 reserved_at_121[0x9];
 	u8 log_max_ra_req_dc[0x6];
-	u8 reserved_at_130[0xa];
+	u8 reserved_at_130[0x3];
+	u8 log_max_static_sq_wq[0x5];
+	u8 reserved_at_138[0x2];
 	u8 log_max_ra_res_dc[0x6];
 	u8 reserved_at_140[0xa];
 	u8 log_max_ra_req_qp[0x6];
@@ -1271,7 +1308,8 @@ struct mlx5_ifc_qos_cap_bits {
 	u8 reserved_at_8[0x8];
 	u8 log_max_flow_meter[0x8];
 	u8 flow_meter_reg_id[0x8];
-	u8 reserved_at_25[0x8];
+	u8 wqe_rate_pp[0x1];
+	u8 reserved_at_25[0x7];
 	u8 flow_meter_reg_share[0x1];
 	u8 reserved_at_2e[0x17];
 	u8 packet_pacing_max_rate[0x20];
@@ -1835,7 +1873,9 @@ struct mlx5_ifc_sqc_bits {
 	u8 reg_umr[0x1];
 	u8 allow_swp[0x1];
 	u8 hairpin[0x1];
-	u8 reserved_at_f[0x11];
+	u8 non_wire[0x1];
+	u8 static_sq_wq[0x1];
+	u8 reserved_at_11[0xf];
 	u8 reserved_at_20[0x8];
 	u8 user_index[0x18];
 	u8 reserved_at_40[0x8];
@@ -1935,6 +1975,11 @@ struct mlx5_ifc_flow_meter_parameters_bits {
 	u8         reserved_at_8[0x60];		// 14h-1Ch
 };
 
+enum {
+	MLX5_CQE_SIZE_64B = 0x0,
+	MLX5_CQE_SIZE_128B = 0x1,
+};
+
 struct mlx5_ifc_cqc_bits {
 	u8 status[0x4];
 	u8 as_notify[0x1];
@@ -2486,6 +2531,20 @@ struct mlx5_ifc_query_qp_in_bits {
 	u8 reserved_at_60[0x20];
 };
 
+enum {
+	MLX5_DATA_RATE = 0x0,
+	MLX5_WQE_RATE = 0x1,
+};
+
+struct mlx5_ifc_set_pp_rate_limit_context_bits {
+	u8 rate_limit[0x20];
+	u8 burst_upper_bound[0x20];
+	u8 reserved_at_40[0xC];
+	u8 rate_mode[0x4];
+	u8 typical_packet_size[0x10];
+	u8 reserved_at_60[0x120];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 02/16] net/mlx5: introduce send scheduling devargs
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 03/16] net/mlx5: fix UAR lock sharing for multiport devices Viacheslav Ovsiienko
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This patch introduces the new devargs:

tx_pp - enables accurate packet send scheduling on mbuf timestamps
  in the PMD. On device start, if the "rte_dynflag_timestamp"
  dynamic flag is registered and a non-zero value is specified
  for this devarg, the driver initializes all the necessary internal
  infrastructure to provide packet scheduling. The parameter
  value specifies the scheduling granularity in nanoseconds.

tx_skew - the parameter adjusts the send packet scheduling on
  timestamps and represents the average delay between the beginning
  of transmit descriptor processing by the hardware and the
  appearance of actual packet data on the wire. The value should
  be provided in nanoseconds and is valid only if tx_pp parameter
  is specified. The default value is zero.
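
For illustration, a rough sketch of the application side, assuming the
dynamic field/flag names from the mbuf patch referenced in the first
patch of this series and omitting most error handling; the devarg
values in "tx_pp=500,tx_skew=8" are arbitrary examples:

    #include <rte_errno.h>
    #include <rte_mbuf_dyn.h>

    static int ts_off;       /* Dynamic timestamp field offset. */
    static uint64_t ts_flag; /* Dynamic "schedule this send" flag. */

    static int
    app_init_tx_timestamping(void)
    {
            static const struct rte_mbuf_dynfield field_desc = {
                    .name = RTE_MBUF_DYNFIELD_TIMESTAMP_NAME,
                    .size = sizeof(uint64_t),
                    .align = __alignof__(uint64_t),
            };
            static const struct rte_mbuf_dynflag flag_desc = {
                    .name = RTE_MBUF_DYNFLAG_TIMESTAMP_NAME,
            };
            int off = rte_mbuf_dynfield_register(&field_desc);
            int bit = rte_mbuf_dynflag_register(&flag_desc);

            if (off < 0 || bit < 0)
                    return -rte_errno;
            ts_off = off;
            ts_flag = 1ULL << bit;
            return 0;
    }

    /* Per packet (it is enough to stamp the first mbuf of a burst): */
    *RTE_MBUF_DYNFIELD(mbuf, ts_off, uint64_t *) = send_time;
    mbuf->ol_flags |= ts_flag;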

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 doc/guides/nics/mlx5.rst         | 37 ++++++++++++++++++++++++++
 drivers/net/mlx5/linux/mlx5_os.c | 57 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5.c          | 39 ++++++++++++++++++++++++---
 drivers/net/mlx5/mlx5.h          |  2 ++
 4 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index b51aa67..6b06d16 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -241,6 +241,24 @@ Limitations
   reduce the requested Tx size or adjust data inline settings with
   ``txq_inline_max`` and ``txq_inline_mpw`` devargs keys.
 
+- To provide packet send scheduling on mbuf timestamps, the ``tx_pp``
+  parameter should be specified, and RTE_MBUF_DYNFIELD_TIMESTAMP_NAME and
+  RTE_MBUF_DYNFLAG_TIMESTAMP_NAME should be registered by the application.
+  When the PMD sees RTE_MBUF_DYNFLAG_TIMESTAMP_NAME set on a packet
+  being sent, it tries to synchronize the time the packet appears on
+  the wire with the specified packet timestamp. If the specified timestamp
+  is in the past, it is ignored; if it is in the distant future, it is
+  capped to some reasonable value (in the range of seconds).
+  These specific cases ("too late" and "distant future") can be optionally
+  reported via device xstats to assist applications in detecting
+  time-related problems.
+
+  No packet reordering according to timestamps is performed, neither
+  within a packet burst nor between packets; it is entirely the
+  application's responsibility to generate packets and their timestamps
+  in the desired order. The timestamp may be put only in the first packet
+  of a burst to schedule the entire burst.
+
 - E-Switch decapsulation Flow:
 
   - can be applied to PF port only.
@@ -700,6 +718,25 @@ Driver options
   variable "MLX5_SHUT_UP_BF" value is used. If there is no "MLX5_SHUT_UP_BF",
   the default ``tx_db_nc`` value is zero for ARM64 hosts and one for others.
 
+- ``tx_pp`` parameter [int]
+
+  If a nonzero value is specified the driver creates all necessary internal
+  objects to provide accurate packet send scheduling on mbuf timestamps.
+  A positive value specifies the scheduling granularity in nanoseconds;
+  packet sending will be accurate up to the specified granularity. The
+  allowed range is from 500 to 1 million nanoseconds. A negative value
+  specifies the granularity by its absolute value and engages a special
+  test mode to check the scheduling rate. By default (if ``tx_pp`` is not
+  specified) the send scheduling on timestamps feature is disabled.
+
+- ``tx_skew`` parameter [int]
+
+  The parameter adjusts the send packet scheduling on timestamps and represents
+  the average delay between the beginning of the transmit descriptor processing
+  by the hardware and appearance of actual packet data on the wire. The value
+  should be provided in nanoseconds and is valid only if ``tx_pp`` parameter is
+  specified. The default value is zero.
+
 - ``tx_vec_en`` parameter [int]
 
   A nonzero value enables Tx vector on ConnectX-5, ConnectX-6, ConnectX-6 Dx
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 2dc57b2..daccd1c 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -879,6 +879,63 @@
 		}
 #endif
 	}
+	if (config.tx_pp) {
+		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
+			config.hca_attr.dev_freq_khz);
+		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
+			config.hca_attr.qos.packet_pacing ? "" : "not ");
+		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
+			config.hca_attr.cross_channel ? "" : "not ");
+		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
+			config.hca_attr.wqe_index_ignore ? "" : "not ");
+		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
+			config.hca_attr.non_wire_sq ? "" : "not ");
+		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
+			config.hca_attr.log_max_static_sq_wq ? "" : "not ",
+			config.hca_attr.log_max_static_sq_wq);
+		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
+			config.hca_attr.qos.wqe_rate_pp ? "" : "not ");
+		if (!config.devx) {
+			DRV_LOG(ERR, "DevX is required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.qos.packet_pacing) {
+			DRV_LOG(ERR, "Packet pacing is not supported");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.cross_channel) {
+			DRV_LOG(ERR, "Cross channel operations are"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.wqe_index_ignore) {
+			DRV_LOG(ERR, "WQE index ignore feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.non_wire_sq) {
+			DRV_LOG(ERR, "Non-wire SQ feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.log_max_static_sq_wq) {
+			DRV_LOG(ERR, "Static WQE SQ feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config.hca_attr.qos.wqe_rate_pp) {
+			DRV_LOG(ERR, "WQE rate mode is required"
+				     " for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+	}
 	if (config.mprq.enabled && mprq) {
 		if (config.mprq.stride_num_n &&
 		    (config.mprq.stride_num_n > mprq_max_stride_num_n ||
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 86b7671..13242a5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -120,6 +120,19 @@
 #define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
 
 /*
+ * Device parameter to enable Tx scheduling on timestamps
+ * and specify the packet pacing granularity in nanoseconds.
+ */
+#define MLX5_TX_PP "tx_pp"
+
+/*
+ * Device parameter to specify the skew in nanoseconds on Tx datapath,
+ * it represents the time between SQ start WQE processing and the
+ * appearance of actual packet data on the wire.
+ */
+#define MLX5_TX_SKEW "tx_skew"
+
+/*
  * Device parameter to enable hardware Tx vector.
  * Deprecated, ignored (no vectorized Tx routines anymore).
  */
@@ -1271,18 +1284,26 @@ struct mlx5_dev_ctx_shared *
 mlx5_args_check(const char *key, const char *val, void *opaque)
 {
 	struct mlx5_dev_config *config = opaque;
-	unsigned long tmp;
+	unsigned long mod;
+	signed long tmp;
 
 	/* No-op, port representors are processed in mlx5_dev_spawn(). */
 	if (!strcmp(MLX5_REPRESENTOR, key))
 		return 0;
 	errno = 0;
-	tmp = strtoul(val, NULL, 0);
+	tmp = strtol(val, NULL, 0);
 	if (errno) {
 		rte_errno = errno;
 		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
 		return -rte_errno;
 	}
+	if (tmp < 0 && strcmp(MLX5_TX_PP, key) && strcmp(MLX5_TX_SKEW, key)) {
+		/* Negative values are acceptable for some keys only. */
+		rte_errno = EINVAL;
+		DRV_LOG(WARNING, "%s: invalid negative value \"%s\"", key, val);
+		return -rte_errno;
+	}
+	mod = tmp >= 0 ? tmp : -tmp;
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
 		config->cqe_comp = !!tmp;
 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
@@ -1333,6 +1354,15 @@ struct mlx5_dev_ctx_shared *
 		config->txq_inline_mpw = tmp;
 	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
 		DRV_LOG(WARNING, "%s: deprecated parameter, ignored", key);
+	} else if (strcmp(MLX5_TX_PP, key) == 0) {
+		if (!mod) {
+			DRV_LOG(ERR, "Zero Tx packet pacing parameter");
+			rte_errno = EINVAL;
+			return -rte_errno;
+		}
+		config->tx_pp = tmp;
+	} else if (strcmp(MLX5_TX_SKEW, key) == 0) {
+		config->tx_skew = tmp;
 	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
 		config->rx_vec_en = !!tmp;
 	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
@@ -1415,6 +1445,8 @@ struct mlx5_dev_ctx_shared *
 		MLX5_TXQ_MPW_HDR_DSEG_EN,
 		MLX5_TXQ_MAX_INLINE_LEN,
 		MLX5_TX_DB_NC,
+		MLX5_TX_PP,
+		MLX5_TX_SKEW,
 		MLX5_TX_VEC_EN,
 		MLX5_RX_VEC_EN,
 		MLX5_L3_VXLAN_EN,
@@ -1693,7 +1725,8 @@ struct mlx5_dev_ctx_shared *
 {
 	static const char *const dynf_names[] = {
 		RTE_PMD_MLX5_FINE_GRANULARITY_INLINE,
-		RTE_MBUF_DYNFLAG_METADATA_NAME
+		RTE_MBUF_DYNFLAG_METADATA_NAME,
+		RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME
 	};
 	unsigned int i;
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 46e66eb..84cd3e1 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -240,6 +240,8 @@ struct mlx5_dev_config {
 	int txq_inline_min; /* Minimal amount of data bytes to inline. */
 	int txq_inline_max; /* Max packet size for inlining with SEND. */
 	int txq_inline_mpw; /* Max packet size for inlining with eMPW. */
+	int tx_pp; /* Timestamp scheduling granularity in nanoseconds. */
+	int tx_skew; /* Tx scheduling skew between WQE and data on wire. */
 	struct mlx5_hca_attr hca_attr; /* HCA attributes. */
 	struct mlx5_lro_config lro; /* LRO configuration. */
 };
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 03/16] net/mlx5: fix UAR lock sharing for multiport devices
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 02/16] net/mlx5: introduce send scheduling devargs Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 04/16] net/mlx5: introduce shared UAR resource Viacheslav Ovsiienko
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit, stable

The master and representor devices might be created over a multiport
Infiniband device, and the UAR resources allocated for sibling
ports might belong to the same underlying Infiniband device.
The hardware requires that write access to the UAR be performed
as an atomic 64-bit write; on 32-bit systems this becomes two
sequential 32-bit writes protected by a lock. Because the same UAR
may be shared between sibling devices, the locks must be moved to
the shared context.
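
For reference, the 32-bit doorbell write these locks protect follows
the pattern of the existing __mlx5_uar_write64() helper in
mlx5_rxtx.h, shown here in a simplified, illustrative form:

    static __rte_always_inline void
    uar_write64_locked(uint64_t val, void *addr, rte_spinlock_t *lock)
    {
            rte_spinlock_lock(lock);
            /* Two 32-bit stores emulate the atomic 64-bit UAR write. */
            *(volatile uint32_t *)addr = (uint32_t)val;
            rte_io_wmb();
            *((volatile uint32_t *)addr + 1) = (uint32_t)(val >> 32);
            rte_spinlock_unlock(lock);
    }

With the locks living in the shared context, sibling ports mapping the
same UAR page now serialize on the same spinlock.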

Fixes: f048f3d479a6 ("net/mlx5: switch to the shared IB device context")
Cc: stable@dpdk.org

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/linux/mlx5_os.c |  6 ------
 drivers/net/mlx5/mlx5.c          |  6 ++++++
 drivers/net/mlx5/mlx5.h          | 10 +++++-----
 drivers/net/mlx5/mlx5_rxq.c      |  2 +-
 drivers/net/mlx5/mlx5_txq.c      |  2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index daccd1c..7abb85d 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -630,12 +630,6 @@
 	priv->mtu = RTE_ETHER_MTU;
 	priv->mp_id.port_id = port_id;
 	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
-#ifndef RTE_ARCH_64
-	/* Initialize UAR access locks for 32bit implementations. */
-	rte_spinlock_init(&priv->uar_lock_cq);
-	for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
-		rte_spinlock_init(&priv->uar_lock[i]);
-#endif
 	/* Some internal functions rely on Netlink sockets, open them now. */
 	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 13242a5..2efbc03 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -717,6 +717,12 @@ struct mlx5_dev_ctx_shared *
 		err = ENOMEM;
 		goto error;
 	}
+#ifndef RTE_ARCH_64
+	/* Initialize UAR access locks for 32bit implementations. */
+	rte_spinlock_init(&sh->uar_lock_cq);
+	for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
+		rte_spinlock_init(&sh->uar_lock[i]);
+#endif
 	/*
 	 * Once the device is added to the list of memory event
 	 * callback, its global MR cache table cannot be expanded
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 84cd3e1..d01d7f3 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -559,6 +559,11 @@ struct mlx5_dev_ctx_shared {
 	void *fdb_domain; /* FDB Direct Rules name space handle. */
 	void *rx_domain; /* RX Direct Rules name space handle. */
 	void *tx_domain; /* TX Direct Rules name space handle. */
+#ifndef RTE_ARCH_64
+	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
+	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
+	/* UAR same-page access control required in 32bit implementations. */
+#endif
 	struct mlx5_hlist *flow_tbls;
 	/* Direct Rules tables for FDB, NIC TX+RX */
 	void *esw_drop_action; /* Pointer to DR E-Switch drop action. */
@@ -673,11 +678,6 @@ struct mlx5_priv {
 	uint8_t mtr_color_reg; /* Meter color match REG_C. */
 	struct mlx5_mtr_profiles flow_meter_profiles; /* MTR profile list. */
 	struct mlx5_flow_meters flow_meters; /* MTR list. */
-#ifndef RTE_ARCH_64
-	rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
-	rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
-	/* UAR same-page access control required in 32bit implementations. */
-#endif
 	uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */
 	uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */
 	struct mlx5_mp_id mp_id; /* ID of a multi-process process */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index b436f06..2681322 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1997,7 +1997,7 @@ struct mlx5_rxq_ctrl *
 	tmpl->rxq.elts =
 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
 #ifndef RTE_ARCH_64
-	tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
+	tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
 #endif
 	tmpl->rxq.idx = idx;
 	rte_atomic32_inc(&tmpl->refcnt);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 35b3ade..e1fa24e 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -355,7 +355,7 @@
 	/* Assign an UAR lock according to UAR page number */
 	lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
 		   MLX5_UAR_PAGE_NUM_MASK;
-	txq_ctrl->txq.uar_lock = &priv->uar_lock[lock_idx];
+	txq_ctrl->txq.uar_lock = &priv->sh->uar_lock[lock_idx];
 #endif
 }
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 04/16] net/mlx5: introduce shared UAR resource
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 02/16] net/mlx5: introduce send scheduling devargs Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 03/16] net/mlx5: fix UAR lock sharing for multiport devices Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 05/16] net/mlx5: create clock queue for packet pacing Viacheslav Ovsiienko
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This is a preparation step before moving Tx queue creation to the
DevX approach. Some features require a UAR shared between the Tx
queues and the scheduling completion queues; this patch manages
the shared UAR.
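
A condensed sketch of the intended usage (the allocation is in the
diff below, the consumers appear later in this series, e.g. in
mlx5_txpp.c):

    /* Allocate the DevX UAR once per shared device context. */
    sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, 0);
    if (!sh->tx_uar)
            return -ENOMEM;
    /* Tx and scheduling queues then reference only the page id. */
    cq_attr.uar_page_id = sh->tx_uar->page_id;
    sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;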

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.c | 14 ++++++++++++++
 drivers/net/mlx5/mlx5.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 2efbc03..612d38c 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -709,6 +709,12 @@ struct mlx5_dev_ctx_shared *
 			err = ENOMEM;
 			goto error;
 		}
+		sh->tx_uar = mlx5_glue->devx_alloc_uar(sh->ctx, 0);
+		if (!sh->tx_uar) {
+			DRV_LOG(ERR, "Failed to allocate DevX UAR.");
+			err = ENOMEM;
+			goto error;
+		}
 	}
 	sh->flow_id_pool = mlx5_flow_id_pool_alloc
 					((1 << HAIRPIN_FLOW_ID_BITS) - 1);
@@ -767,6 +773,10 @@ struct mlx5_dev_ctx_shared *
 		mlx5_l3t_destroy(sh->cnt_id_tbl);
 		sh->cnt_id_tbl = NULL;
 	}
+	if (sh->tx_uar) {
+		mlx5_glue->devx_free_uar(sh->tx_uar);
+		sh->tx_uar = NULL;
+	}
 	if (sh->tis)
 		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
 	if (sh->td)
@@ -832,6 +842,10 @@ struct mlx5_dev_ctx_shared *
 		mlx5_l3t_destroy(sh->cnt_id_tbl);
 		sh->cnt_id_tbl = NULL;
 	}
+	if (sh->tx_uar) {
+		mlx5_glue->devx_free_uar(sh->tx_uar);
+		sh->tx_uar = NULL;
+	}
 	if (sh->pd)
 		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
 	if (sh->tis)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index d01d7f3..799b8e3 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -586,6 +586,7 @@ struct mlx5_dev_ctx_shared {
 	struct mlx5_devx_obj *tis; /* TIS object. */
 	struct mlx5_devx_obj *td; /* Transport domain. */
 	struct mlx5_flow_id_pool *flow_id_pool; /* Flow ID pool. */
+	struct mlx5dv_devx_uar *tx_uar; /* Tx/packet pacing shared UAR. */
 	struct mlx5_dev_shared_port port[]; /* per device port data array. */
 };
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 05/16] net/mlx5: create clock queue for packet pacing
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (2 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 04/16] net/mlx5: introduce shared UAR resource Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 06/16] net/mlx5: create rearm " Viacheslav Ovsiienko
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This patch creates a special completion queue (the Clock Queue) that
provides reference completions used to schedule packet sends from
the other transmitting queues.
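
Conceptually (this is not part of the patch, the actual conversion
arrives later in the series), each Clock Queue completion marks one
tick of sh->txpp.tick nanoseconds, so a transmitting queue can map an
mbuf timestamp onto the Clock Queue index which a WAIT condition
should target. A hypothetical helper, with illustrative names and
rounding:

    static inline uint32_t
    txpp_ts_to_clock_index(struct mlx5_dev_txpp *txpp,
                           uint64_t ts_ns, uint64_t start_ns)
    {
            uint64_t ticks = (ts_ns - start_ns + txpp->skew) / txpp->tick;

            return (uint32_t)(ticks & ((1U << MLX5_CQ_INDEX_WIDTH) - 1));
    }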

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/Makefile        |   1 +
 drivers/net/mlx5/linux/mlx5_os.c |   3 +
 drivers/net/mlx5/meson.build     |   1 +
 drivers/net/mlx5/mlx5.c          |   2 +
 drivers/net/mlx5/mlx5.h          |  47 +++++
 drivers/net/mlx5/mlx5_defs.h     |   7 +
 drivers/net/mlx5/mlx5_trigger.c  |  16 +-
 drivers/net/mlx5/mlx5_txpp.c     | 446 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 518 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/mlx5/mlx5_txpp.c

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index a458402..9eaac6b 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -11,6 +11,7 @@ LIB = librte_pmd_mlx5.a
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxq.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txq.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_txpp.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rxtx.c
 ifneq ($(filter y,$(CONFIG_RTE_ARCH_X86_64) \
 			$(CONFIG_RTE_ARCH_PPC_64) \
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 7abb85d..ff93095 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1863,6 +1863,9 @@
 {
 	int dbmap_env;
 	int err = 0;
+
+	sh->numa_node = spawn->pci_dev->device.numa_node;
+	pthread_mutex_init(&sh->txpp.mutex, NULL);
 	/*
 	 * Configure environment variable "MLX5_BF_SHUT_UP"
 	 * before the device creation. The rdma_core library
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index e95ce02..c06b153 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -26,6 +26,7 @@ sources = files(
 	'mlx5_stats.c',
 	'mlx5_trigger.c',
 	'mlx5_txq.c',
+	'mlx5_txpp.c',
 	'mlx5_vlan.c',
 	'mlx5_utils.c',
 )
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 612d38c..ee721fd 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -767,6 +767,7 @@ struct mlx5_dev_ctx_shared *
 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
 	return sh;
 error:
+	pthread_mutex_destroy(&sh->txpp.mutex);
 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
 	MLX5_ASSERT(sh);
 	if (sh->cnt_id_tbl) {
@@ -856,6 +857,7 @@ struct mlx5_dev_ctx_shared *
 		claim_zero(mlx5_glue->close_device(sh->ctx));
 	if (sh->flow_id_pool)
 		mlx5_flow_id_pool_release(sh->flow_id_pool);
+	pthread_mutex_destroy(&sh->txpp.mutex);
 	rte_free(sh);
 exit:
 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 799b8e3..be28d80 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -531,6 +531,44 @@ struct mlx5_flow_id_pool {
 	uint32_t max_id; /**< Maximum id can be allocated from the pool. */
 };
 
+/* Tx pacing queue structure - for Clock and Rearm queues. */
+struct mlx5_txpp_wq {
+	/* Completion Queue related data.*/
+	struct mlx5_devx_obj *cq;
+	struct mlx5dv_devx_umem *cq_umem;
+	union {
+		volatile void *cq_buf;
+		volatile struct mlx5_cqe *cqes;
+	};
+	volatile uint32_t *cq_dbrec;
+	uint32_t cq_ci:24;
+	uint32_t arm_sn:2;
+	/* Send Queue related data.*/
+	struct mlx5_devx_obj *sq;
+	struct mlx5dv_devx_umem *sq_umem;
+	union {
+		volatile void *sq_buf;
+		volatile struct mlx5_wqe *wqes;
+	};
+	uint16_t sq_size; /* Number of WQEs in the queue. */
+	uint16_t sq_ci; /* Next WQE to execute. */
+	volatile uint32_t *sq_dbrec;
+};
+
+/* Tx packet pacing structure. */
+struct mlx5_dev_txpp {
+	pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
+	uint32_t refcnt; /* Pacing reference counter. */
+	uint32_t freq; /* Timestamp frequency, Hz. */
+	uint32_t tick; /* Completion tick duration in nanoseconds. */
+	uint32_t test; /* Packet pacing test mode. */
+	int32_t skew; /* Scheduling skew. */
+	uint32_t eqn; /* Event Queue number. */
+	struct rte_intr_handle intr_handle; /* Periodic interrupt. */
+	struct mlx5dv_devx_event_channel *echan; /* Event Channel. */
+	struct mlx5_txpp_wq clock_queue; /* Clock Queue. */
+};
+
 /*
  * Shared Infiniband device context for Master/Representors
  * which belong to same IB device with multiple IB ports.
@@ -547,9 +585,12 @@ struct mlx5_dev_ctx_shared {
 	char ibdev_name[DEV_SYSFS_NAME_MAX]; /* SYSFS dev name. */
 	char ibdev_path[DEV_SYSFS_PATH_MAX]; /* SYSFS dev path for secondary */
 	struct mlx5_dev_attr device_attr; /* Device properties. */
+	int numa_node; /* Numa node of backing physical device. */
 	LIST_ENTRY(mlx5_dev_ctx_shared) mem_event_cb;
 	/**< Called by memory event callback. */
 	struct mlx5_mr_share_cache share_cache;
+	/* Packet pacing related structure. */
+	struct mlx5_dev_txpp txpp;
 	/* Shared DV/DR flow data section. */
 	pthread_mutex_t dv_mutex; /* DV context mutex. */
 	uint32_t dv_meta_mask; /* flow META metadata supported mask. */
@@ -622,6 +663,7 @@ struct mlx5_priv {
 	unsigned int representor:1; /* Device is a port representor. */
 	unsigned int master:1; /* Device is a E-Switch master. */
 	unsigned int dr_shared:1; /* DV/DR data is shared. */
+	unsigned int txpp_en:1; /* Tx packet pacing enabled. */
 	unsigned int counter_fallback:1; /* Use counter fallback management. */
 	unsigned int mtr_en:1; /* Whether support meter. */
 	unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */
@@ -944,4 +986,9 @@ int mlx5_os_read_dev_stat(struct mlx5_priv *priv,
 void mlx5_os_stats_init(struct rte_eth_dev *dev);
 void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
 			   mlx5_dereg_mr_t *dereg_mr_cb);
+/* mlx5_txpp.c */
+
+int mlx5_txpp_start(struct rte_eth_dev *dev);
+void mlx5_txpp_stop(struct rte_eth_dev *dev);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 260f584..fff11af 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -171,6 +171,13 @@
 #define MLX5_TXDB_NCACHED 1
 #define MLX5_TXDB_HEURISTIC 2
 
+/* Tx accurate scheduling on timestamps parameters. */
+#define MLX5_TXPP_CLKQ_SIZE 1
+/* The minimal size test packet to put into one WQE, padded by HW. */
+#define MLX5_TXPP_TEST_PKT_SIZE	(sizeof(struct rte_ether_hdr) +	\
+				 sizeof(struct rte_ipv4_hdr))
+
+
 /* Size of the simple hash table for metadata register table. */
 #define MLX5_FLOW_MREG_HTABLE_SZ 4096
 #define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE"
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index ef74609..ca25ad9 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -288,25 +288,29 @@
 			return -rte_errno;
 		}
 	}
+	ret = mlx5_txpp_start(dev);
+	if (ret) {
+		DRV_LOG(ERR, "port %u Tx packet pacing init failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		goto error;
+	}
 	ret = mlx5_txq_start(dev);
 	if (ret) {
 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
 			dev->data->port_id, strerror(rte_errno));
-		return -rte_errno;
+		goto error;
 	}
 	ret = mlx5_rxq_start(dev);
 	if (ret) {
 		DRV_LOG(ERR, "port %u Rx queue allocation failed: %s",
 			dev->data->port_id, strerror(rte_errno));
-		mlx5_txq_stop(dev);
-		return -rte_errno;
+		goto error;
 	}
 	ret = mlx5_hairpin_bind(dev);
 	if (ret) {
 		DRV_LOG(ERR, "port %u hairpin binding failed: %s",
 			dev->data->port_id, strerror(rte_errno));
-		mlx5_txq_stop(dev);
-		return -rte_errno;
+		goto error;
 	}
 	/* Set started flag here for the following steps like control flow. */
 	dev->data->dev_started = 1;
@@ -362,6 +366,7 @@
 	mlx5_traffic_disable(dev);
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
+	mlx5_txpp_stop(dev); /* Stop last. */
 	rte_errno = ret; /* Restore rte_errno. */
 	return -rte_errno;
 }
@@ -398,6 +403,7 @@
 	priv->sh->port[priv->dev_port - 1].devx_ih_port_id = RTE_MAX_ETHPORTS;
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
+	mlx5_txpp_stop(dev);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
new file mode 100644
index 0000000..7f8a6c4
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+#include <rte_ether.h>
+#include <rte_ethdev_driver.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+
+/* Destroy Event Queue Notification Channel. */
+static void
+mlx5_txpp_destroy_eqn(struct mlx5_dev_ctx_shared *sh)
+{
+	if (sh->txpp.echan) {
+		mlx5_glue->devx_destroy_event_channel(sh->txpp.echan);
+		sh->txpp.echan = NULL;
+	}
+	sh->txpp.eqn = 0;
+}
+
+/* Create Event Queue Notification Channel. */
+static int
+mlx5_txpp_create_eqn(struct mlx5_dev_ctx_shared *sh)
+{
+	uint32_t lcore;
+
+	MLX5_ASSERT(!sh->txpp.echan);
+	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
+	if (mlx5_glue->devx_query_eqn(sh->ctx, lcore, &sh->txpp.eqn)) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
+		sh->txpp.eqn = 0;
+		return -rte_errno;
+	}
+	sh->txpp.echan = mlx5_glue->devx_create_event_channel(sh->ctx,
+			MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
+	if (!sh->txpp.echan) {
+		sh->txpp.eqn = 0;
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create event channel %d.",
+			rte_errno);
+		return -rte_errno;
+	}
+	return 0;
+}
+
+static void
+mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+
+	if (wq->sq)
+		claim_zero(mlx5_devx_cmd_destroy(wq->sq));
+	if (wq->sq_umem)
+		claim_zero(mlx5_glue->devx_umem_dereg(wq->sq_umem));
+	if (wq->sq_buf)
+		rte_free((void *)(uintptr_t)wq->sq_buf);
+	if (wq->cq)
+		claim_zero(mlx5_devx_cmd_destroy(wq->cq));
+	if (wq->cq_umem)
+		claim_zero(mlx5_glue->devx_umem_dereg(wq->cq_umem));
+	if (wq->cq_buf)
+		rte_free((void *)(uintptr_t)wq->cq_buf);
+	memset(wq, 0, sizeof(*wq));
+}
+
+static void
+mlx5_txpp_fill_wqe_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	struct mlx5_wqe *wqe = (struct mlx5_wqe *)(uintptr_t)wq->wqes;
+	struct mlx5_wqe_cseg *cs = &wqe->cseg;
+	uint32_t wqe_size, opcode, i;
+	uint8_t *dst;
+
+	/* For test purposes fill the WQ with SEND inline packet. */
+	if (sh->txpp.test) {
+		wqe_size = RTE_ALIGN(MLX5_TXPP_TEST_PKT_SIZE +
+				     MLX5_WQE_CSEG_SIZE +
+				     2 * MLX5_WQE_ESEG_SIZE -
+				     MLX5_ESEG_MIN_INLINE_SIZE,
+				     MLX5_WSEG_SIZE);
+		opcode = MLX5_OPCODE_SEND;
+	} else {
+		wqe_size = MLX5_WSEG_SIZE;
+		opcode = MLX5_OPCODE_NOP;
+	}
+	cs->opcode = rte_cpu_to_be_32(opcode | 0); /* Index is ignored. */
+	cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) |
+				     (wqe_size / MLX5_WSEG_SIZE));
+	cs->flags = RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET);
+	cs->misc = RTE_BE32(0);
+	wqe_size = RTE_ALIGN(wqe_size, MLX5_WQE_SIZE);
+	if (sh->txpp.test) {
+		struct mlx5_wqe_eseg *es = &wqe->eseg;
+		struct rte_ether_hdr *eth_hdr;
+		struct rte_ipv4_hdr *ip_hdr;
+		struct rte_udp_hdr *udp_hdr;
+
+		/* Build the inline test packet pattern. */
+		MLX5_ASSERT(wqe_size <= MLX5_WQE_SIZE_MAX);
+		MLX5_ASSERT(MLX5_TXPP_TEST_PKT_SIZE >=
+				(sizeof(struct rte_ether_hdr) +
+				 sizeof(struct rte_ipv4_hdr)));
+		es->flags = 0;
+		es->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+		es->swp_offs = 0;
+		es->metadata = 0;
+		es->swp_flags = 0;
+		es->mss = 0;
+		es->inline_hdr_sz = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE);
+		/* Build test packet L2 header (Ethernet). */
+		dst = (uint8_t *)&es->inline_data;
+		eth_hdr = (struct rte_ether_hdr *)dst;
+		rte_eth_random_addr(&eth_hdr->d_addr.addr_bytes[0]);
+		rte_eth_random_addr(&eth_hdr->s_addr.addr_bytes[0]);
+		eth_hdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4);
+		/* Build test packet L3 header (IP v4). */
+		dst += sizeof(struct rte_ether_hdr);
+		ip_hdr = (struct rte_ipv4_hdr *)dst;
+		ip_hdr->version_ihl = RTE_IPV4_VHL_DEF;
+		ip_hdr->type_of_service = 0;
+		ip_hdr->fragment_offset = 0;
+		ip_hdr->time_to_live = 64;
+		ip_hdr->next_proto_id = IPPROTO_UDP;
+		ip_hdr->packet_id = 0;
+		ip_hdr->total_length = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE -
+						sizeof(struct rte_ether_hdr));
+		/* use RFC5735 / RFC2544 reserved network test addresses */
+		ip_hdr->src_addr = RTE_BE32((198U << 24) | (18 << 16) |
+					    (0 << 8) | 1);
+		ip_hdr->dst_addr = RTE_BE32((198U << 24) | (18 << 16) |
+					    (0 << 8) | 2);
+		if (MLX5_TXPP_TEST_PKT_SIZE <
+					(sizeof(struct rte_ether_hdr) +
+					 sizeof(struct rte_ipv4_hdr) +
+					 sizeof(struct rte_udp_hdr)))
+			goto wcopy;
+		/* Build test packet L4 header (UDP). */
+		dst += sizeof(struct rte_ipv4_hdr);
+		udp_hdr = (struct rte_udp_hdr *)dst;
+		udp_hdr->src_port = RTE_BE16(9); /* RFC863 Discard. */
+		udp_hdr->dst_port = RTE_BE16(9);
+		udp_hdr->dgram_len = RTE_BE16(MLX5_TXPP_TEST_PKT_SIZE -
+					      sizeof(struct rte_ether_hdr) -
+					      sizeof(struct rte_ipv4_hdr));
+		udp_hdr->dgram_cksum = 0;
+		/* Fill the test packet data. */
+		dst += sizeof(struct rte_udp_hdr);
+		for (i = sizeof(struct rte_ether_hdr) +
+			sizeof(struct rte_ipv4_hdr) +
+			sizeof(struct rte_udp_hdr);
+				i < MLX5_TXPP_TEST_PKT_SIZE; i++)
+			*dst++ = (uint8_t)(i & 0xFF);
+	}
+wcopy:
+	/* Duplicate the pattern to the next WQEs. */
+	dst = (uint8_t *)(uintptr_t)wq->sq_buf;
+	for (i = 1; i < MLX5_TXPP_CLKQ_SIZE; i++) {
+		dst += wqe_size;
+		rte_memcpy(dst, (void *)(uintptr_t)wq->sq_buf, wqe_size);
+	}
+}
+
+/* Creates the Clock Queue for packet pacing, returns zero on success. */
+static int
+mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_devx_create_sq_attr sq_attr = { 0 };
+	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
+	struct mlx5_devx_cq_attr cq_attr = { 0 };
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	uint32_t umem_size, umem_dbrec;
+	int ret;
+
+	/* Allocate memory buffer for CQEs and doorbell record. */
+	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
+	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+	umem_size += MLX5_DBR_SIZE;
+	wq->cq_buf = rte_zmalloc_socket(__func__, umem_size,
+					page_size, sh->numa_node);
+	if (!wq->cq_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for Clock Queue.");
+		return -ENOMEM;
+	}
+	/* Register allocated buffer in user space with DevX. */
+	wq->cq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
+					       (void *)(uintptr_t)wq->cq_buf,
+					       umem_size,
+					       IBV_ACCESS_LOCAL_WRITE);
+	if (!wq->cq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to register umem for Clock Queue.");
+		goto error;
+	}
+	/* Create completion queue object for Clock Queue. */
+	cq_attr.cqe_size = (sizeof(struct mlx5_cqe) == 128) ?
+			    MLX5_CQE_SIZE_128B : MLX5_CQE_SIZE_64B;
+	cq_attr.use_first_only = 1;
+	cq_attr.overrun_ignore = 1;
+	cq_attr.uar_page_id = sh->tx_uar->page_id;
+	cq_attr.eqn = sh->txpp.eqn;
+	cq_attr.q_umem_valid = 1;
+	cq_attr.q_umem_offset = 0;
+	cq_attr.q_umem_id = wq->cq_umem->umem_id;
+	cq_attr.db_umem_valid = 1;
+	cq_attr.db_umem_offset = umem_dbrec;
+	cq_attr.db_umem_id = wq->cq_umem->umem_id;
+	cq_attr.log_cq_size = rte_log2_u32(MLX5_TXPP_CLKQ_SIZE);
+	cq_attr.log_page_size = rte_log2_u32(page_size);
+	wq->cq = mlx5_devx_cmd_create_cq(sh->ctx, &cq_attr);
+	if (!wq->cq) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create CQ for Clock Queue.");
+		goto error;
+	}
+	wq->cq_dbrec = RTE_PTR_ADD(wq->cq_buf, umem_dbrec);
+	wq->cq_ci = 0;
+	/* Allocate memory buffer for Send Queue WQEs. */
+	if (sh->txpp.test) {
+		wq->sq_size = RTE_ALIGN(MLX5_TXPP_TEST_PKT_SIZE +
+					MLX5_WQE_CSEG_SIZE +
+					2 * MLX5_WQE_ESEG_SIZE -
+					MLX5_ESEG_MIN_INLINE_SIZE,
+					MLX5_WQE_SIZE) / MLX5_WQE_SIZE;
+		wq->sq_size *= MLX5_TXPP_CLKQ_SIZE;
+	} else {
+		wq->sq_size = MLX5_TXPP_CLKQ_SIZE;
+	}
+	/* There should not be WQE leftovers in the cyclic queue. */
+	MLX5_ASSERT(wq->sq_size == (1 << log2above(wq->sq_size)));
+	umem_size =  MLX5_WQE_SIZE * wq->sq_size;
+	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+	umem_size += MLX5_DBR_SIZE;
+	wq->sq_buf = rte_zmalloc_socket(__func__, umem_size,
+					page_size, sh->numa_node);
+	if (!wq->sq_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for Clock Queue.");
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	/* Register allocated buffer in user space with DevX. */
+	wq->sq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
+					       (void *)(uintptr_t)wq->sq_buf,
+					       umem_size,
+					       IBV_ACCESS_LOCAL_WRITE);
+	if (!wq->sq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to register umem for Clock Queue.");
+		goto error;
+	}
+	/* Create send queue object for Clock Queue. */
+	if (sh->txpp.test) {
+		sq_attr.tis_lst_sz = 1;
+		sq_attr.tis_num = sh->tis->id;
+		sq_attr.non_wire = 0;
+		sq_attr.static_sq_wq = 1;
+	} else {
+		sq_attr.non_wire = 1;
+		sq_attr.static_sq_wq = 1;
+	}
+	sq_attr.state = MLX5_SQC_STATE_RST;
+	sq_attr.cqn = wq->cq->id;
+	sq_attr.wq_attr.cd_slave = 1;
+	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
+	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
+	sq_attr.wq_attr.pd = sh->pdn;
+	sq_attr.wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
+	sq_attr.wq_attr.log_wq_sz = rte_log2_u32(wq->sq_size);
+	sq_attr.wq_attr.dbr_umem_valid = 1;
+	sq_attr.wq_attr.dbr_addr = umem_dbrec;
+	sq_attr.wq_attr.dbr_umem_id = wq->sq_umem->umem_id;
+	sq_attr.wq_attr.wq_umem_valid = 1;
+	sq_attr.wq_attr.wq_umem_id = wq->sq_umem->umem_id;
+	/* umem_offset must be zero for static_sq_wq queue. */
+	sq_attr.wq_attr.wq_umem_offset = 0;
+	wq->sq = mlx5_devx_cmd_create_sq(sh->ctx, &sq_attr);
+	if (!wq->sq) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create SQ for Clock Queue.");
+		goto error;
+	}
+	wq->sq_dbrec = RTE_PTR_ADD(wq->sq_buf, umem_dbrec +
+				   MLX5_SND_DBR * sizeof(uint32_t));
+	/* Build the WQEs in the Send Queue before goto Ready state. */
+	mlx5_txpp_fill_wqe_clock_queue(sh);
+	/* Change queue state to ready. */
+	msq_attr.sq_state = MLX5_SQC_STATE_RST;
+	msq_attr.state = MLX5_SQC_STATE_RDY;
+	wq->sq_ci = 0;
+	ret = mlx5_devx_cmd_modify_sq(wq->sq, &msq_attr);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to set SQ ready state Clock Queue.");
+		goto error;
+	}
+	return 0;
+error:
+	ret = -rte_errno;
+	mlx5_txpp_destroy_clock_queue(sh);
+	rte_errno = -ret;
+	return ret;
+}
+
+/*
+ * The routine initializes the packet pacing infrastructure:
+ * - allocates PP context
+ * - Clock CQ/SQ
+ * - Rearm CQ/SQ
+ * - attaches rearm interrupt handler
+ *
+ * Returns 0 on success, negative otherwise
+ */
+static int
+mlx5_txpp_create(struct mlx5_dev_ctx_shared *sh, struct mlx5_priv *priv)
+{
+	int tx_pp = priv->config.tx_pp;
+	int ret;
+
+	/* Store the requested pacing parameters. */
+	sh->txpp.tick = tx_pp >= 0 ? tx_pp : -tx_pp;
+	sh->txpp.test = !!(tx_pp < 0);
+	sh->txpp.skew = priv->config.tx_skew;
+	sh->txpp.freq = priv->config.hca_attr.dev_freq_khz;
+	ret = mlx5_txpp_create_eqn(sh);
+	if (ret)
+		goto exit;
+	ret = mlx5_txpp_create_clock_queue(sh);
+	if (ret)
+		goto exit;
+exit:
+	if (ret) {
+		mlx5_txpp_destroy_clock_queue(sh);
+		mlx5_txpp_destroy_eqn(sh);
+		sh->txpp.tick = 0;
+		sh->txpp.test = 0;
+		sh->txpp.skew = 0;
+	}
+	return ret;
+}
+
+/*
+ * The routine destroys the packet pacing infrastructure:
+ * - detaches rearm interrupt handler
+ * - Rearm CQ/SQ
+ * - Clock CQ/SQ
+ * - PP context
+ */
+static void
+mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
+{
+	mlx5_txpp_destroy_clock_queue(sh);
+	mlx5_txpp_destroy_eqn(sh);
+	sh->txpp.tick = 0;
+	sh->txpp.test = 0;
+	sh->txpp.skew = 0;
+}
+
+/**
+ * Creates and starts packet pacing infrastructure on specified device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_txpp_start(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	int err = 0;
+	int ret;
+
+	if (!priv->config.tx_pp) {
+		/* Packet pacing is not requested for the device. */
+		MLX5_ASSERT(priv->txpp_en == 0);
+		return 0;
+	}
+	if (priv->txpp_en) {
+		/* Packet pacing is already enabled for the device. */
+		MLX5_ASSERT(sh->txpp.refcnt);
+		return 0;
+	}
+	ret = rte_mbuf_dynflag_lookup(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
+	if (ret < 0)
+		return 0;
+	ret = pthread_mutex_lock(&sh->txpp.mutex);
+	MLX5_ASSERT(!ret);
+	RTE_SET_USED(ret);
+	if (sh->txpp.refcnt) {
+		priv->txpp_en = 1;
+		++sh->txpp.refcnt;
+	} else {
+		err = mlx5_txpp_create(sh, priv);
+		if (!err) {
+			MLX5_ASSERT(sh->txpp.tick);
+			priv->txpp_en = 1;
+			sh->txpp.refcnt = 1;
+		} else {
+			rte_errno = -err;
+		}
+	}
+	ret = pthread_mutex_unlock(&sh->txpp.mutex);
+	MLX5_ASSERT(!ret);
+	RTE_SET_USED(ret);
+	return err;
+}
+
+/**
+ * Stops and destroys packet pacing infrastructure on specified device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ *
+ * The routine does nothing if packet pacing was not enabled on the device.
+ */
+void
+mlx5_txpp_stop(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	int ret;
+
+	if (!priv->txpp_en) {
+		/* Packet pacing is already disabled for the device. */
+		return;
+	}
+	priv->txpp_en = 0;
+	ret = pthread_mutex_lock(&sh->txpp.mutex);
+	MLX5_ASSERT(!ret);
+	RTE_SET_USED(ret);
+	MLX5_ASSERT(sh->txpp.refcnt);
+	/* Destroy the infrastructure on the last reference only and */
+	/* always fall through to release the mutex taken above. */
+	if (sh->txpp.refcnt && --sh->txpp.refcnt == 0)
+		mlx5_txpp_destroy(sh);
+	ret = pthread_mutex_unlock(&sh->txpp.mutex);
+	MLX5_ASSERT(!ret);
+	RTE_SET_USED(ret);
+}
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 06/16] net/mlx5: create rearm queue for packet pacing
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (3 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 05/16] net/mlx5: create clock queue for packet pacing Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 07/16] net/mlx5: create Tx queues with DevX Viacheslav Ovsiienko
                   ` (9 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

The dedicated Rearm Queue is needed to fire the work requests to
the Clock Queue in realtime. The Clock Queue should never stop,
otherwise the clock synchronization might be broken and packet
send scheduling would fail. The Rearm Queue uses cross-channel
SEND_EN/WAIT operations to provide the requests to the
Clock Queue in a robust way.
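
For readability, the WQE/CQE index arithmetic used when building the
Rearm Queue WQEs (it mirrors the hunk below, the helper names are
hypothetical) boils down to: each SEND_EN/WAIT pair enables one more
quarter of the Clock Queue WQE index space and waits on the matching
point of its CQE index space.

    /* MLX5_TXPP_REARM is (1UL << MLX5_WQ_INDEX_WIDTH) / 4 = 16384. */
    static inline uint32_t
    rearm_sq_index(uint32_t i) /* i - SEND_EN/WAIT pair index. */
    {
            return (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM) &
                   ((1 << MLX5_WQ_INDEX_WIDTH) - 1);
    }

    static inline uint32_t
    rearm_cq_index(uint32_t i)
    {
            return (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM / 2) &
                   ((1 << MLX5_CQ_INDEX_WIDTH) - 1);
    }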

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h      |   1 +
 drivers/net/mlx5/mlx5_defs.h |   5 +-
 drivers/net/mlx5/mlx5_txpp.c | 203 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 205 insertions(+), 4 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index be28d80..a1956cc 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -567,6 +567,7 @@ struct mlx5_dev_txpp {
 	struct rte_intr_handle intr_handle; /* Periodic interrupt. */
 	struct mlx5dv_devx_event_channel *echan; /* Event Channel. */
 	struct mlx5_txpp_wq clock_queue; /* Clock Queue. */
+	struct mlx5_txpp_wq rearm_queue; /* Rearm Queue. */
 };
 
 /*
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index fff11af..35f02cb 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -173,11 +173,14 @@
 
 /* Tx accurate scheduling on timestamps parameters. */
 #define MLX5_TXPP_CLKQ_SIZE 1
+#define MLX5_TXPP_REARM	((1UL << MLX5_WQ_INDEX_WIDTH) / 4)
+#define MLX5_TXPP_REARM_SQ_SIZE (((1UL << MLX5_CQ_INDEX_WIDTH) / \
+				  MLX5_TXPP_REARM) * 2)
+#define MLX5_TXPP_REARM_CQ_SIZE (MLX5_TXPP_REARM_SQ_SIZE / 2)
 /* The minimal size test packet to put into one WQE, padded by HW. */
 #define MLX5_TXPP_TEST_PKT_SIZE	(sizeof(struct rte_ether_hdr) +	\
 				 sizeof(struct rte_ipv4_hdr))
 
-
 /* Size of the simple hash table for metadata register table. */
 #define MLX5_FLOW_MREG_HTABLE_SZ 4096
 #define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE"
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 7f8a6c4..34ac493 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -9,6 +9,7 @@
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
+#include "mlx5_common_os.h"
 
 /* Destroy Event Queue Notification Channel. */
 static void
@@ -48,10 +49,8 @@
 }
 
 static void
-mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
+mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)
 {
-	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
-
 	if (wq->sq)
 		claim_zero(mlx5_devx_cmd_destroy(wq->sq));
 	if (wq->sq_umem)
@@ -68,6 +67,199 @@
 }
 
 static void
+mlx5_txpp_destroy_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+
+	mlx5_txpp_destroy_send_queue(wq);
+}
+
+static void
+mlx5_txpp_destroy_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+
+	mlx5_txpp_destroy_send_queue(wq);
+}
+
+static void
+mlx5_txpp_fill_cqe_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
+	uint32_t i;
+
+	for (i = 0; i < MLX5_TXPP_REARM_CQ_SIZE; i++) {
+		cqe->op_own = (MLX5_CQE_INVALID << 4) | MLX5_CQE_OWNER_MASK;
+		++cqe;
+	}
+}
+
+static void
+mlx5_txpp_fill_wqe_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	struct mlx5_wqe *wqe = (struct mlx5_wqe *)(uintptr_t)wq->wqes;
+	uint32_t i;
+
+	for (i = 0; i < wq->sq_size; i += 2) {
+		struct mlx5_wqe_cseg *cs;
+		struct mlx5_wqe_qseg *qs;
+		uint32_t index;
+
+		/* Build SEND_EN request with slave WQE index. */
+		cs = &wqe[i + 0].cseg;
+		cs->opcode = RTE_BE32(MLX5_OPCODE_SEND_EN | 0);
+		cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) | 2);
+		cs->flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+				     MLX5_COMP_MODE_OFFSET);
+		cs->misc = RTE_BE32(0);
+		qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
+		index = (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM) &
+			((1 << MLX5_WQ_INDEX_WIDTH) - 1);
+		qs->max_index = rte_cpu_to_be_32(index);
+		qs->qpn_cqn = rte_cpu_to_be_32(sh->txpp.clock_queue.sq->id);
+		/* Build WAIT request with slave CQE index. */
+		cs = &wqe[i + 1].cseg;
+		cs->opcode = RTE_BE32(MLX5_OPCODE_WAIT | 0);
+		cs->sq_ds = rte_cpu_to_be_32((wq->sq->id << 8) | 2);
+		cs->flags = RTE_BE32(MLX5_COMP_ONLY_ERR <<
+				     MLX5_COMP_MODE_OFFSET);
+		cs->misc = RTE_BE32(0);
+		qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
+		index = (i * MLX5_TXPP_REARM / 2 + MLX5_TXPP_REARM / 2) &
+			((1 << MLX5_CQ_INDEX_WIDTH) - 1);
+		qs->max_index = rte_cpu_to_be_32(index);
+		qs->qpn_cqn = rte_cpu_to_be_32(sh->txpp.clock_queue.cq->id);
+	}
+}
+
+/* Creates the Rearm Queue to fire the requests to Clock Queue in realtime. */
+static int
+mlx5_txpp_create_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_devx_create_sq_attr sq_attr = { 0 };
+	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
+	struct mlx5_devx_cq_attr cq_attr = { 0 };
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	uint32_t umem_size, umem_dbrec;
+	int ret;
+
+	/* Allocate memory buffer for CQEs and doorbell record. */
+	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_REARM_CQ_SIZE;
+	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+	umem_size += MLX5_DBR_SIZE;
+	wq->cq_buf = rte_zmalloc_socket(__func__, umem_size,
+					page_size, sh->numa_node);
+	if (!wq->cq_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for Rearm Queue.");
+		return -ENOMEM;
+	}
+	/* Register allocated buffer in user space with DevX. */
+	wq->cq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
+					       (void *)(uintptr_t)wq->cq_buf,
+					       umem_size,
+					       IBV_ACCESS_LOCAL_WRITE);
+	if (!wq->cq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to register umem for Rearm Queue.");
+		goto error;
+	}
+	/* Create completion queue object for Rearm Queue. */
+	cq_attr.cqe_size = (sizeof(struct mlx5_cqe) == 128) ?
+			    MLX5_CQE_SIZE_128B : MLX5_CQE_SIZE_64B;
+	cq_attr.uar_page_id = sh->tx_uar->page_id;
+	cq_attr.eqn = sh->txpp.eqn;
+	cq_attr.q_umem_valid = 1;
+	cq_attr.q_umem_offset = 0;
+	cq_attr.q_umem_id = mlx5_os_get_umem_id(wq->cq_umem);
+	cq_attr.db_umem_valid = 1;
+	cq_attr.db_umem_offset = umem_dbrec;
+	cq_attr.db_umem_id = mlx5_os_get_umem_id(wq->cq_umem);
+	cq_attr.log_cq_size = rte_log2_u32(MLX5_TXPP_REARM_CQ_SIZE);
+	cq_attr.log_page_size = rte_log2_u32(page_size);
+	wq->cq = mlx5_devx_cmd_create_cq(sh->ctx, &cq_attr);
+	if (!wq->cq) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create CQ for Rearm Queue.");
+		goto error;
+	}
+	wq->cq_dbrec = RTE_PTR_ADD(wq->cq_buf, umem_dbrec);
+	wq->cq_ci = 0;
+	wq->arm_sn = 0;
+	/* Mark all CQEs initially as invalid. */
+	mlx5_txpp_fill_cqe_rearm_queue(sh);
+	/*
+	 * Allocate memory buffer for Send Queue WQEs.
+	 * There should be no WQE leftovers in the cyclic queue.
+	 */
+	wq->sq_size = MLX5_TXPP_REARM_SQ_SIZE;
+	MLX5_ASSERT(wq->sq_size == (1 << log2above(wq->sq_size)));
+	umem_size =  MLX5_WQE_SIZE * wq->sq_size;
+	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
+	umem_size += MLX5_DBR_SIZE;
+	wq->sq_buf = rte_zmalloc_socket(__func__, umem_size,
+					page_size, sh->numa_node);
+	if (!wq->sq_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for Rearm Queue.");
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	/* Register allocated buffer in user space with DevX. */
+	wq->sq_umem = mlx5_glue->devx_umem_reg(sh->ctx,
+					       (void *)(uintptr_t)wq->sq_buf,
+					       umem_size,
+					       IBV_ACCESS_LOCAL_WRITE);
+	if (!wq->sq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to register umem for Rearm Queue.");
+		goto error;
+	}
+	/* Create send queue object for Rearm Queue. */
+	sq_attr.state = MLX5_SQC_STATE_RST;
+	sq_attr.tis_lst_sz = 1;
+	sq_attr.tis_num = sh->tis->id;
+	sq_attr.cqn = wq->cq->id;
+	sq_attr.cd_master = 1;
+	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
+	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
+	sq_attr.wq_attr.pd = sh->pdn;
+	sq_attr.wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
+	sq_attr.wq_attr.log_wq_sz = rte_log2_u32(wq->sq_size);
+	sq_attr.wq_attr.dbr_umem_valid = 1;
+	sq_attr.wq_attr.dbr_addr = umem_dbrec;
+	sq_attr.wq_attr.dbr_umem_id = mlx5_os_get_umem_id(wq->sq_umem);
+	sq_attr.wq_attr.wq_umem_valid = 1;
+	sq_attr.wq_attr.wq_umem_id = mlx5_os_get_umem_id(wq->sq_umem);
+	sq_attr.wq_attr.wq_umem_offset = 0;
+	wq->sq = mlx5_devx_cmd_create_sq(sh->ctx, &sq_attr);
+	if (!wq->sq) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create SQ for Rearm Queue.");
+		goto error;
+	}
+	wq->sq_dbrec = RTE_PTR_ADD(wq->sq_buf, umem_dbrec +
+				   MLX5_SND_DBR * sizeof(uint32_t));
+	/* Build the WQEs in the Send Queue before going to Ready state. */
+	mlx5_txpp_fill_wqe_rearm_queue(sh);
+	/* Change queue state to ready. */
+	msq_attr.sq_state = MLX5_SQC_STATE_RST;
+	msq_attr.state = MLX5_SQC_STATE_RDY;
+	ret = mlx5_devx_cmd_modify_sq(wq->sq, &msq_attr);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to set SQ ready state for Rearm Queue.");
+		goto error;
+	}
+	return 0;
+error:
+	ret = -rte_errno;
+	mlx5_txpp_destroy_rearm_queue(sh);
+	rte_errno = -ret;
+	return ret;
+}
+
+static void
 mlx5_txpp_fill_wqe_clock_queue(struct mlx5_dev_ctx_shared *sh)
 {
 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
@@ -331,8 +523,12 @@
 	ret = mlx5_txpp_create_clock_queue(sh);
 	if (ret)
 		goto exit;
+	ret = mlx5_txpp_create_rearm_queue(sh);
+	if (ret)
+		goto exit;
 exit:
 	if (ret) {
+		mlx5_txpp_destroy_rearm_queue(sh);
 		mlx5_txpp_destroy_clock_queue(sh);
 		mlx5_txpp_destroy_eqn(sh);
 		sh->txpp.tick = 0;
@@ -352,6 +548,7 @@
 static void
 mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
 {
+	mlx5_txpp_destroy_rearm_queue(sh);
 	mlx5_txpp_destroy_clock_queue(sh);
 	mlx5_txpp_destroy_eqn(sh);
 	sh->txpp.tick = 0;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 07/16] net/mlx5: create Tx queues with DevX
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (4 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 06/16] net/mlx5: create rearm " Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 08/16] net/mlx5: allocate packet pacing context Viacheslav Ovsiienko
                   ` (8 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

To provide packet send scheduling on mbuf timestamps the Tx
queue must be attached to the same UAR as the Clock Queue.
A UAR is a special hardware-related resource mapped to the host
memory that provides the doorbell registers; assigning a UAR
to the queue being created is possible via the DevX API only.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c    | 108 ++++++++++-----
 drivers/net/mlx5/mlx5_rxtx.h    |  14 ++
 drivers/net/mlx5/mlx5_trigger.c |   6 +-
 drivers/net/mlx5/mlx5_txq.c     | 299 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 386 insertions(+), 41 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e4106bf..c456d20 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -944,43 +944,79 @@ enum mlx5_txcmp_code {
 		struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
 		struct mlx5_txq_ctrl *txq_ctrl =
 			container_of(txq, struct mlx5_txq_ctrl, txq);
-		struct ibv_qp_attr mod = {
-			.qp_state = IBV_QPS_RESET,
-			.port_num = (uint8_t)priv->dev_port,
-		};
-		struct ibv_qp *qp = txq_ctrl->obj->qp;
 
-		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-		if (ret) {
-			DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
-				"%s", strerror(errno));
-			rte_errno = errno;
-			return ret;
-		}
-		mod.qp_state = IBV_QPS_INIT;
-		ret = mlx5_glue->modify_qp(qp, &mod,
-					   (IBV_QP_STATE | IBV_QP_PORT));
-		if (ret) {
-			DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
-				strerror(errno));
-			rte_errno = errno;
-			return ret;
-		}
-		mod.qp_state = IBV_QPS_RTR;
-		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-		if (ret) {
-			DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
-				strerror(errno));
-			rte_errno = errno;
-			return ret;
-		}
-		mod.qp_state = IBV_QPS_RTS;
-		ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
-		if (ret) {
-			DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
-				strerror(errno));
-			rte_errno = errno;
-			return ret;
+		if (txq_ctrl->obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_SQ) {
+			struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
+
+			/* Change queue state to reset. */
+			msq_attr.sq_state = MLX5_SQC_STATE_ERR;
+			msq_attr.state = MLX5_SQC_STATE_RST;
+			ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq_devx,
+						      &msq_attr);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to RESET %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
+			/* Change queue state to ready. */
+			msq_attr.sq_state = MLX5_SQC_STATE_RST;
+			msq_attr.state = MLX5_SQC_STATE_RDY;
+			ret = mlx5_devx_cmd_modify_sq(txq_ctrl->obj->sq_devx,
+						      &msq_attr);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to READY %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
+		} else {
+			struct ibv_qp_attr mod = {
+				.qp_state = IBV_QPS_RESET,
+				.port_num = (uint8_t)priv->dev_port,
+			};
+			struct ibv_qp *qp = txq_ctrl->obj->qp;
+
+			MLX5_ASSERT
+				(txq_ctrl->obj->type == MLX5_TXQ_OBJ_TYPE_IBV);
+
+			ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to RESET %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
+			mod.qp_state = IBV_QPS_INIT;
+			ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to INIT %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
+			mod.qp_state = IBV_QPS_RTR;
+			ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to RTR %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
+			mod.qp_state = IBV_QPS_RTS;
+			ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
+			if (ret) {
+				DRV_LOG(ERR, "Cannot change the "
+					"Tx QP state to RTS %s",
+					strerror(errno));
+				rte_errno = errno;
+				return ret;
+			}
 		}
 	}
 	return 0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 26621ff..1b797da 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -323,6 +323,7 @@ struct mlx5_txq_data {
 
 enum mlx5_txq_obj_type {
 	MLX5_TXQ_OBJ_TYPE_IBV,		/* mlx5_txq_obj with ibv_wq. */
+	MLX5_TXQ_OBJ_TYPE_DEVX_SQ,	/* mlx5_txq_obj with mlx5_devx_sq. */
 	MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN,
 	/* mlx5_txq_obj with mlx5_devx_tq and hairpin support. */
 };
@@ -349,6 +350,19 @@ struct mlx5_txq_obj {
 			/* DevX object for Sx queue. */
 			struct mlx5_devx_obj *tis; /* The TIS object. */
 		};
+		struct {
+			struct rte_eth_dev *dev;
+			struct mlx5_devx_obj *cq_devx;
+			struct mlx5dv_devx_umem *cq_umem;
+			void *cq_buf;
+			int64_t cq_dbrec_offset;
+			struct mlx5_devx_dbr_page *cq_dbrec_page;
+			struct mlx5_devx_obj *sq_devx;
+			struct mlx5dv_devx_umem *sq_umem;
+			void *sq_buf;
+			int64_t sq_dbrec_offset;
+			struct mlx5_devx_dbr_page *sq_dbrec_page;
+		};
 	};
 };
 
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index ca25ad9..449dd95 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -56,10 +56,12 @@
 		if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
 			txq_ctrl->obj = mlx5_txq_obj_new
 				(dev, i, MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN);
-		} else {
+		} else  {
 			txq_alloc_elts(txq_ctrl);
 			txq_ctrl->obj = mlx5_txq_obj_new
-				(dev, i, MLX5_TXQ_OBJ_TYPE_IBV);
+				(dev, i, priv->txpp_en ?
+				MLX5_TXQ_OBJ_TYPE_DEVX_SQ :
+				MLX5_TXQ_OBJ_TYPE_IBV);
 		}
 		if (!txq_ctrl->obj) {
 			rte_errno = ENOMEM;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index e1fa24e..a6f7e1c 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -31,6 +31,7 @@
 #include <mlx5_devx_cmds.h>
 #include <mlx5_common.h>
 #include <mlx5_common_mr.h>
+#include <mlx5_common_os.h>
 
 #include "mlx5_defs.h"
 #include "mlx5_utils.h"
@@ -573,6 +574,290 @@
 }
 
 /**
+ * Destroy the Tx queue DevX object.
+ *
+ * @param txq_obj
+ *   Txq object to destroy
+ */
+static void
+txq_release_sq_resources(struct mlx5_txq_obj *txq_obj)
+{
+	MLX5_ASSERT(txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_SQ);
+
+	if (txq_obj->sq_devx)
+		claim_zero(mlx5_devx_cmd_destroy(txq_obj->sq_devx));
+	if (txq_obj->sq_dbrec_page)
+		claim_zero(mlx5_release_dbr
+				(&txq_obj->txq_ctrl->priv->dbrpgs,
+				mlx5_os_get_umem_id
+					(txq_obj->sq_dbrec_page->umem),
+				txq_obj->sq_dbrec_offset));
+	if (txq_obj->sq_umem)
+		claim_zero(mlx5_glue->devx_umem_dereg(txq_obj->sq_umem));
+	if (txq_obj->sq_buf)
+		rte_free(txq_obj->sq_buf);
+	if (txq_obj->cq_devx)
+		claim_zero(mlx5_devx_cmd_destroy(txq_obj->cq_devx));
+	if (txq_obj->cq_dbrec_page)
+		claim_zero(mlx5_release_dbr
+				(&txq_obj->txq_ctrl->priv->dbrpgs,
+				mlx5_os_get_umem_id
+					(txq_obj->cq_dbrec_page->umem),
+				txq_obj->cq_dbrec_offset));
+	if (txq_obj->cq_umem)
+		claim_zero(mlx5_glue->devx_umem_dereg(txq_obj->cq_umem));
+	if (txq_obj->cq_buf)
+		rte_free(txq_obj->cq_buf);
+}
+
+/**
+ * Create the Tx queue DevX object.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Tx queue array
+ *
+ * @return
+ *   The DevX object initialised, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_txq_obj *
+mlx5_txq_obj_devx_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq_data, struct mlx5_txq_ctrl, txq);
+	struct mlx5_devx_create_sq_attr sq_attr = { 0 };
+	struct mlx5_devx_modify_sq_attr msq_attr = { 0 };
+	struct mlx5_devx_cq_attr cq_attr = { 0 };
+	struct mlx5_txq_obj *txq_obj = NULL;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	struct mlx5_cqe *cqe;
+	uint32_t i, nqe;
+	int ret = 0;
+
+	MLX5_ASSERT(txq_data);
+	MLX5_ASSERT(!txq_ctrl->obj);
+	txq_obj = rte_calloc_socket(__func__, 1,
+				    sizeof(struct mlx5_txq_obj), 0,
+				    txq_ctrl->socket);
+	if (!txq_obj) {
+		DRV_LOG(ERR,
+			"port %u Tx queue %u cannot allocate memory resources",
+			dev->data->port_id, txq_data->idx);
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	txq_obj->type = MLX5_TXQ_OBJ_TYPE_DEVX_SQ;
+	txq_obj->txq_ctrl = txq_ctrl;
+	txq_obj->dev = dev;
+	/* Create the Completion Queue. */
+	nqe = (1UL << txq_data->elts_n) / MLX5_TX_COMP_THRESH +
+	       1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+	nqe = 1UL << log2above(nqe);
+	if (nqe > UINT16_MAX) {
+		DRV_LOG(ERR,
+			"port %u Tx queue %u requests too many CQEs %u",
+			dev->data->port_id, txq_data->idx, nqe);
+		rte_errno = EINVAL;
+		goto error;
+	}
+	/* Allocate memory buffer for CQEs. */
+	txq_obj->cq_buf = rte_zmalloc_socket(__func__,
+					     nqe * sizeof(struct mlx5_cqe),
+					     MLX5_CQE_BUF_ALIGNMENT,
+					     sh->numa_node);
+	if (!txq_obj->cq_buf) {
+		DRV_LOG(ERR,
+			"port %u Tx queue %u cannot allocate memory (CQ)",
+			dev->data->port_id, txq_data->idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	txq_data->cqe_n = log2above(nqe);
+	txq_data->cqe_s = 1 << txq_data->cqe_n;
+	txq_data->cqe_m = txq_data->cqe_s - 1;
+	txq_data->cqes = (volatile struct mlx5_cqe *)txq_obj->cq_buf;
+	txq_data->cq_ci = 0;
+	txq_data->cq_pi = 0;
+	/* Register allocated buffer in user space with DevX. */
+	txq_obj->cq_umem = mlx5_glue->devx_umem_reg
+					(sh->ctx,
+					 (void *)txq_obj->cq_buf,
+					 nqe * sizeof(struct mlx5_cqe),
+					 IBV_ACCESS_LOCAL_WRITE);
+	if (!txq_obj->cq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR,
+			"port %u Tx queue %u cannot register memory (CQ)",
+			dev->data->port_id, txq_data->idx);
+		goto error;
+	}
+	/* Allocate doorbell record for completion queue. */
+	txq_obj->cq_dbrec_offset = mlx5_get_dbr(sh->ctx,
+						&priv->dbrpgs,
+						&txq_obj->cq_dbrec_page);
+	if (txq_obj->cq_dbrec_offset < 0)
+		goto error;
+	txq_data->cq_db = (volatile uint32_t *)(txq_obj->cq_dbrec_page->dbrs +
+						txq_obj->cq_dbrec_offset);
+	*txq_data->cq_db = 0;
+	/* Create completion queue object with DevX. */
+	cq_attr.cqe_size = (sizeof(struct mlx5_cqe) == 128) ?
+			    MLX5_CQE_SIZE_128B : MLX5_CQE_SIZE_64B;
+	cq_attr.uar_page_id = sh->tx_uar->page_id;
+	cq_attr.eqn = sh->txpp.eqn;
+	cq_attr.q_umem_valid = 1;
+	cq_attr.q_umem_offset = (uintptr_t)txq_obj->cq_buf % page_size;
+	cq_attr.q_umem_id = txq_obj->cq_umem->umem_id;
+	cq_attr.db_umem_valid = 1;
+	cq_attr.db_umem_offset = txq_obj->cq_dbrec_offset;
+	cq_attr.db_umem_id = mlx5_os_get_umem_id(txq_obj->cq_dbrec_page->umem);
+	cq_attr.log_cq_size = rte_log2_u32(nqe);
+	cq_attr.log_page_size = rte_log2_u32(page_size);
+	txq_obj->cq_devx = mlx5_devx_cmd_create_cq(sh->ctx, &cq_attr);
+	if (!txq_obj->cq_devx) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "port %u Tx queue %u CQ creation failure",
+			dev->data->port_id, idx);
+		goto error;
+	}
+	/* Initial fill CQ buffer with invalid CQE opcode. */
+	cqe = (struct mlx5_cqe *)txq_obj->cq_buf;
+	for (i = 0; i < txq_data->cqe_s; i++) {
+		cqe->op_own = (MLX5_CQE_INVALID << 4) | MLX5_CQE_OWNER_MASK;
+		++cqe;
+	}
+	/* Create the Work Queue. */
+	nqe = RTE_MIN(1UL << txq_data->elts_n,
+		      (uint32_t)sh->device_attr.max_qp_wr);
+	txq_obj->sq_buf = rte_zmalloc_socket(__func__,
+					     nqe * sizeof(struct mlx5_wqe),
+					     page_size,
+					     sh->numa_node);
+	if (!txq_obj->sq_buf) {
+		DRV_LOG(ERR,
+			"port %u Tx queue %u cannot allocate memory (SQ)",
+			dev->data->port_id, txq_data->idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	txq_data->wqe_n = log2above(nqe);
+	txq_data->wqe_s = 1 << txq_data->wqe_n;
+	txq_data->wqe_m = txq_data->wqe_s - 1;
+	txq_data->wqes = (struct mlx5_wqe *)txq_obj->sq_buf;
+	txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
+	txq_data->wqe_ci = 0;
+	txq_data->wqe_pi = 0;
+	txq_data->wqe_comp = 0;
+	txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
+	/* Register allocated buffer in user space with DevX. */
+	txq_obj->sq_umem = mlx5_glue->devx_umem_reg
+					(sh->ctx,
+					 (void *)txq_obj->sq_buf,
+					 nqe * sizeof(struct mlx5_wqe),
+					 IBV_ACCESS_LOCAL_WRITE);
+	if (!txq_obj->sq_umem) {
+		rte_errno = errno;
+		DRV_LOG(ERR,
+			"port %u Tx queue %u cannot register memory (SQ)",
+			dev->data->port_id, txq_data->idx);
+		goto error;
+	}
+	/* Allocate doorbell record for send queue. */
+	txq_obj->sq_dbrec_offset = mlx5_get_dbr(sh->ctx,
+						&priv->dbrpgs,
+						&txq_obj->sq_dbrec_page);
+	if (txq_obj->sq_dbrec_offset < 0)
+		goto error;
+	txq_data->qp_db = (volatile uint32_t *)
+					(txq_obj->sq_dbrec_page->dbrs +
+					 txq_obj->sq_dbrec_offset +
+					 MLX5_SND_DBR * sizeof(uint32_t));
+	*txq_data->qp_db = 0;
+	/* Create Send Queue object with DevX. */
+	sq_attr.tis_lst_sz = 1;
+	sq_attr.tis_num = sh->tis->id;
+	sq_attr.state = MLX5_SQC_STATE_RST;
+	sq_attr.cqn = txq_obj->cq_devx->id;
+	sq_attr.flush_in_error_en = 1;
+	sq_attr.allow_multi_pkt_send_wqe = !!priv->config.mps;
+	sq_attr.allow_swp = !!priv->config.swp;
+	sq_attr.min_wqe_inline_mode = priv->config.hca_attr.vport_inline_mode;
+	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
+	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
+	sq_attr.wq_attr.pd = sh->pdn;
+	sq_attr.wq_attr.log_wq_stride = rte_log2_u32(MLX5_WQE_SIZE);
+	sq_attr.wq_attr.log_wq_sz = txq_data->wqe_n;
+	sq_attr.wq_attr.dbr_umem_valid = 1;
+	sq_attr.wq_attr.dbr_addr = txq_obj->sq_dbrec_offset;
+	sq_attr.wq_attr.dbr_umem_id =
+			mlx5_os_get_umem_id(txq_obj->sq_dbrec_page->umem);
+	sq_attr.wq_attr.wq_umem_valid = 1;
+	sq_attr.wq_attr.wq_umem_id = txq_obj->sq_umem->umem_id;
+	sq_attr.wq_attr.wq_umem_offset = (uintptr_t)txq_obj->sq_buf % page_size;
+	txq_obj->sq_devx = mlx5_devx_cmd_create_sq(sh->ctx, &sq_attr);
+	if (!txq_obj->sq_devx) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "port %u Tx queue %u SQ creation failure",
+			dev->data->port_id, idx);
+		goto error;
+	}
+	txq_data->qp_num_8s = txq_obj->sq_devx->id << 8;
+	/* Change Send Queue state to Ready-to-Send. */
+	msq_attr.sq_state = MLX5_SQC_STATE_RST;
+	msq_attr.state = MLX5_SQC_STATE_RDY;
+	ret = mlx5_devx_cmd_modify_sq(txq_obj->sq_devx, &msq_attr);
+	if (ret) {
+		rte_errno = errno;
+		DRV_LOG(ERR,
+			"port %u Tx queue %u SQ state to SQC_STATE_RDY failed",
+			dev->data->port_id, idx);
+		goto error;
+	}
+	txq_data->fcqs = rte_calloc_socket(__func__,
+					   txq_data->cqe_s,
+					   sizeof(*txq_data->fcqs),
+					   RTE_CACHE_LINE_SIZE,
+					   txq_ctrl->socket);
+	if (!txq_data->fcqs) {
+		DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory (FCQ)",
+			dev->data->port_id, idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	/*
+	 * If using DevX need to query and store TIS transport domain value.
+	 * This is done once per port.
+	 * Will use this value on Rx, when creating matching TIR.
+	 */
+	if (priv->config.devx && !priv->sh->tdn)
+		priv->sh->tdn = priv->sh->td->id;
+#endif
+	MLX5_ASSERT(sh->tx_uar);
+	MLX5_ASSERT(sh->tx_uar->reg_addr);
+	txq_ctrl->bf_reg = sh->tx_uar->reg_addr;
+	txq_ctrl->uar_mmap_offset = sh->tx_uar->mmap_off;
+	rte_atomic32_set(&txq_obj->refcnt, 1);
+	txq_uar_init(txq_ctrl);
+	LIST_INSERT_HEAD(&priv->txqsobj, txq_obj, next);
+	return txq_obj;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	txq_release_sq_resources(txq_obj);
+	if (txq_data && txq_data->fcqs) {
+		rte_free(txq_data->fcqs);
+		txq_data->fcqs = NULL;
+	}
+	rte_free(txq_obj);
+	rte_errno = ret; /* Restore rte_errno. */
+	return NULL;
+}
+
+/**
  * Create the Tx queue Verbs object.
  *
  * @param dev
@@ -609,6 +894,8 @@ struct mlx5_txq_obj *
 
 	if (type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN)
 		return mlx5_txq_obj_hairpin_new(dev, idx);
+	if (type == MLX5_TXQ_OBJ_TYPE_DEVX_SQ)
+		return mlx5_txq_obj_devx_new(dev, idx);
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	/* If using DevX, need additional mask to read tisn value. */
 	if (priv->config.devx && !priv->sh->tdn)
@@ -817,8 +1104,10 @@ struct mlx5_txq_obj *
 		claim_zero(mlx5_glue->destroy_cq(tmpl.cq));
 	if (tmpl.qp)
 		claim_zero(mlx5_glue->destroy_qp(tmpl.qp));
-	if (txq_data->fcqs)
+	if (txq_data && txq_data->fcqs) {
 		rte_free(txq_data->fcqs);
+		txq_data->fcqs = NULL;
+	}
 	if (txq_obj)
 		rte_free(txq_obj);
 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
@@ -870,11 +1159,15 @@ struct mlx5_txq_obj *
 		if (txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_HAIRPIN) {
 			if (txq_obj->tis)
 				claim_zero(mlx5_devx_cmd_destroy(txq_obj->tis));
+		} else if (txq_obj->type == MLX5_TXQ_OBJ_TYPE_DEVX_SQ) {
+			txq_release_sq_resources(txq_obj);
 		} else {
 			claim_zero(mlx5_glue->destroy_qp(txq_obj->qp));
 			claim_zero(mlx5_glue->destroy_cq(txq_obj->cq));
-				if (txq_obj->txq_ctrl->txq.fcqs)
-					rte_free(txq_obj->txq_ctrl->txq.fcqs);
+		}
+		if (txq_obj->txq_ctrl->txq.fcqs) {
+			rte_free(txq_obj->txq_ctrl->txq.fcqs);
+			txq_obj->txq_ctrl->txq.fcqs = NULL;
 		}
 		LIST_REMOVE(txq_obj, next);
 		rte_free(txq_obj);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 08/16] net/mlx5: allocate packet pacing context
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (5 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 07/16] net/mlx5: create Tx queues with DevX Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 09/16] net/mlx5: introduce clock queue service routine Viacheslav Ovsiienko
                   ` (7 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This patch allocates the Packet Pacing context from the kernel,
configures it according to the requested send scheduling
granularity and assigns it to the Clock Queue.
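
A side note for readers (not part of the patch): the rate written into
the pacing context is derived from the Clock Queue tick exactly as in
mlx5_txpp_alloc_pp_index() below. A tiny standalone sketch of that
arithmetic, with an assumed 500 ns tick and a 64-byte test packet:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>

#define NS_PER_S 1000000000ull

int main(void)
{
	uint64_t tick = 500;	/* assumed device tick granularity, ns */
	uint64_t len = 64;	/* assumed test packet length, bytes */
	uint64_t rate = NS_PER_S / tick;

	if (rate * tick != NS_PER_S)
		printf("warning: tick does not divide one second evenly\n");
	printf("WQE rate: %" PRIu64 " per second\n", rate);
	/* In test (data rate) mode the limit is given in kilobits/second. */
	printf("data rate: %" PRIu64 " kbit/s\n",
	       (rate * len) / (1000ull / CHAR_BIT));
	return 0;
}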

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h      |  2 ++
 drivers/net/mlx5/mlx5_txpp.c | 71 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a1956cc..c1eafed 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -568,6 +568,8 @@ struct mlx5_dev_txpp {
 	struct mlx5dv_devx_event_channel *echan; /* Event Channel. */
 	struct mlx5_txpp_wq clock_queue; /* Clock Queue. */
 	struct mlx5_txpp_wq rearm_queue; /* Rearm Queue. */
+	struct mlx5dv_pp *pp; /* Packet pacing context. */
+	uint16_t pp_id; /* Packet pacing context index. */
 };
 
 /*
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 34ac493..ebc24ba 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -6,6 +6,7 @@
 #include <rte_interrupts.h>
 #include <rte_alarm.h>
 #include <rte_malloc.h>
+#include <rte_cycles.h>
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -49,6 +50,69 @@
 }
 
 static void
+mlx5_txpp_free_pp_index(struct mlx5_dev_ctx_shared *sh)
+{
+	if (sh->txpp.pp) {
+		mlx5_glue->dv_free_pp(sh->txpp.pp);
+		sh->txpp.pp = NULL;
+		sh->txpp.pp_id = 0;
+	}
+}
+
+/* Allocate Packet Pacing index from kernel via mlx5dv call. */
+static int
+mlx5_txpp_alloc_pp_index(struct mlx5_dev_ctx_shared *sh)
+{
+#ifdef HAVE_MLX5DV_PP_ALLOC
+	uint32_t pp[MLX5_ST_SZ_DW(set_pp_rate_limit_context)];
+	uint64_t rate;
+
+	MLX5_ASSERT(!sh->txpp.pp);
+	memset(&pp, 0, sizeof(pp));
+	rate = NS_PER_S / sh->txpp.tick;
+	if (rate * sh->txpp.tick != NS_PER_S)
+		DRV_LOG(WARNING, "Packet pacing frequency is not precise.");
+	if (sh->txpp.test) {
+		uint32_t len;
+
+		len = RTE_MAX(MLX5_TXPP_TEST_PKT_SIZE,
+			      (size_t)RTE_ETHER_MIN_LEN);
+		MLX5_SET(set_pp_rate_limit_context, &pp,
+			 burst_upper_bound, len);
+		MLX5_SET(set_pp_rate_limit_context, &pp,
+			 typical_packet_size, len);
+		/* Convert packets per second into kilobits. */
+		rate = (rate * len) / (1000ul / CHAR_BIT);
+		DRV_LOG(INFO, "Packet pacing rate set to %" PRIu64, rate);
+	}
+	MLX5_SET(set_pp_rate_limit_context, &pp, rate_limit, rate);
+	MLX5_SET(set_pp_rate_limit_context, &pp, rate_mode,
+		 sh->txpp.test ? MLX5_DATA_RATE : MLX5_WQE_RATE);
+	sh->txpp.pp = mlx5_glue->dv_alloc_pp
+				(sh->ctx, sizeof(pp), &pp,
+				 MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX);
+	if (sh->txpp.pp == NULL) {
+		DRV_LOG(ERR, "Failed to allocate packet pacing index.");
+		rte_errno = errno;
+		return -errno;
+	}
+	if (!sh->txpp.pp->index) {
+		DRV_LOG(ERR, "Zero packet pacing index allocated.");
+		mlx5_txpp_free_pp_index(sh);
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	sh->txpp.pp_id = sh->txpp.pp->index;
+	return 0;
+#else
+	RTE_SET_USED(sh);
+	DRV_LOG(ERR, "Allocating pacing index is not supported.");
+	rte_errno = ENOTSUP;
+	return -ENOTSUP;
+#endif
+}
+
+static void
 mlx5_txpp_destroy_send_queue(struct mlx5_txpp_wq *wq)
 {
 	if (wq->sq)
@@ -457,6 +521,7 @@
 	}
 	sq_attr.state = MLX5_SQC_STATE_RST;
 	sq_attr.cqn = wq->cq->id;
+	sq_attr.packet_pacing_rate_limit_index = sh->txpp.pp_id;
 	sq_attr.wq_attr.cd_slave = 1;
 	sq_attr.wq_attr.uar_page = sh->tx_uar->page_id;
 	sq_attr.wq_attr.wq_type = MLX5_WQ_TYPE_CYCLIC;
@@ -503,6 +568,7 @@
  * - Clock CQ/SQ
  * - Rearm CQ/SQ
  * - attaches rearm interrupt handler
+ * - starts Clock Queue
  *
  * Returns 0 on success, negative otherwise
  */
@@ -520,6 +586,9 @@
 	ret = mlx5_txpp_create_eqn(sh);
 	if (ret)
 		goto exit;
+	ret = mlx5_txpp_alloc_pp_index(sh);
+	if (ret)
+		goto exit;
 	ret = mlx5_txpp_create_clock_queue(sh);
 	if (ret)
 		goto exit;
@@ -530,6 +599,7 @@
 	if (ret) {
 		mlx5_txpp_destroy_rearm_queue(sh);
 		mlx5_txpp_destroy_clock_queue(sh);
+		mlx5_txpp_free_pp_index(sh);
 		mlx5_txpp_destroy_eqn(sh);
 		sh->txpp.tick = 0;
 		sh->txpp.test = 0;
@@ -550,6 +620,7 @@
 {
 	mlx5_txpp_destroy_rearm_queue(sh);
 	mlx5_txpp_destroy_clock_queue(sh);
+	mlx5_txpp_free_pp_index(sh);
 	mlx5_txpp_destroy_eqn(sh);
 	sh->txpp.tick = 0;
 	sh->txpp.test = 0;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 09/16] net/mlx5: introduce clock queue service routine
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (6 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 08/16] net/mlx5: allocate packet pacing context Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 10/16] net/mlx5: prepare Tx queue structures to support timestamp Viacheslav Ovsiienko
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

The service routine is invoked periodically on Rearm Queue
completion interrupts, typically once every few milliseconds
(1-16), to track clock jitter and wander in a robust fashion.
It performs the following:

- fetches the completed CQEs for Rearm Queue
- restarts Rearm Queue on errors
- pushes new requests to Rearm Queue to keep it
  continuously running and pushing cross-channel requests
  to Clock Queue
- reads and caches the Clock Queue CQE to be used in datapath
  (the CQE timestamp conversion is sketched after this list)
- gathers statistics to estimate clock jitter and wander
- gathers Clock Queue errors statistics
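
Not part of the patch, but possibly helpful: the Clock Queue CQE carries
the time split into a 32-bit seconds part and a 32-bit nanoseconds part,
and the mlx5_txpp_convert_rx_ts() helper added below folds it into a
linear nanosecond counter. A standalone sketch of that conversion:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NS_PER_S 1000000000ull

/* Mirrors mlx5_txpp_convert_rx_ts(): upper 32 bits are seconds,
 * lower 32 bits are the nanosecond remainder.
 */
static uint64_t convert_rx_ts(uint64_t ts)
{
	return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
}

int main(void)
{
	/* Assumed sample: 5 s and 123456789 ns in the raw CQE layout. */
	uint64_t raw = (5ull << 32) | 123456789u;

	printf("linear timestamp: %" PRIu64 " ns\n", convert_rx_ts(raw));
	return 0;
}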

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h      |  15 ++
 drivers/net/mlx5/mlx5_defs.h |   1 +
 drivers/net/mlx5/mlx5_rxtx.h |  20 +++
 drivers/net/mlx5/mlx5_txpp.c | 318 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 354 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index c1eafed..52b38cc 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -555,6 +555,12 @@ struct mlx5_txpp_wq {
 	volatile uint32_t *sq_dbrec;
 };
 
+/* Tx packet pacing internal timestamp. */
+struct mlx5_txpp_ts {
+	rte_atomic64_t ci_ts;
+	rte_atomic64_t ts;
+};
+
 /* Tx packet pacing structure. */
 struct mlx5_dev_txpp {
 	pthread_mutex_t mutex; /* Pacing create/destroy mutex. */
@@ -570,6 +576,15 @@ struct mlx5_dev_txpp {
 	struct mlx5_txpp_wq rearm_queue; /* Rearm Queue. */
 	struct mlx5dv_pp *pp; /* Packet pacing context. */
 	uint16_t pp_id; /* Packet pacing context index. */
+	uint16_t ts_n; /* Number of captured timestamps. */
+	uint16_t ts_p; /* Pointer to statistics timestamp. */
+	struct mlx5_txpp_ts *tsa; /* Timestamps sliding window stats. */
+	struct mlx5_txpp_ts ts; /* Cached completion id/timestamp. */
+	uint32_t sync_lost:1; /* ci/timestamp synchronization lost. */
+	/* Statistics counters. */
+	rte_atomic32_t err_miss_int; /* Missed service interrupt. */
+	rte_atomic32_t err_rearm_queue; /* Rearm Queue errors. */
+	rte_atomic32_t err_clock_queue; /* Clock Queue errors. */
 };
 
 /*
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 35f02cb..b640d4a 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -172,6 +172,7 @@
 #define MLX5_TXDB_HEURISTIC 2
 
 /* Tx accurate scheduling on timestamps parameters. */
+#define MLX5_TXPP_WAIT_INIT_TS 1000ul /* How long to wait for the first timestamp (1-ms polls). */
 #define MLX5_TXPP_CLKQ_SIZE 1
 #define MLX5_TXPP_REARM	((1UL << MLX5_WQ_INDEX_WIDTH) / 4)
 #define MLX5_TXPP_REARM_SQ_SIZE (((1UL << MLX5_CQ_INDEX_WIDTH) / \
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b797da..8a8d2b5 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -30,6 +30,7 @@
 #include <rte_io.h>
 #include <rte_bus_pci.h>
 #include <rte_malloc.h>
+#include <rte_cycles.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_prm.h>
@@ -695,4 +696,23 @@ int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova,
 	mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
 }
 
+/**
+ * Convert timestamp from HW format to linear counter
+ * from Packet Pacing Clock Queue CQE timestamp format.
+ *
+ * @param sh
+ *   Pointer to the device shared context. Might be needed
+ *   to convert according current device configuration.
+ * @param ts
+ *   Timestamp from CQE to convert.
+ * @return
+ *   UTC in nanoseconds
+ */
+static __rte_always_inline uint64_t
+mlx5_txpp_convert_rx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t ts)
+{
+	RTE_SET_USED(sh);
+	return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index ebc24ba..3736f7a 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -1,6 +1,9 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2020 Mellanox Technologies, Ltd
  */
+#include <fcntl.h>
+#include <stdint.h>
+
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
 #include <rte_interrupts.h>
@@ -144,6 +147,33 @@
 	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
 
 	mlx5_txpp_destroy_send_queue(wq);
+	if (sh->txpp.tsa) {
+		rte_free(sh->txpp.tsa);
+		sh->txpp.tsa = NULL;
+	}
+}
+
+static void
+mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared *sh, uint16_t ci)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	union {
+		uint32_t w32[2];
+		uint64_t w64;
+	} cs;
+
+	wq->sq_ci = ci + 1;
+	cs.w32[0] = rte_cpu_to_be_32(rte_be_to_cpu_32
+		   (wq->wqes[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
+	cs.w32[1] = wq->wqes[ci & (wq->sq_size - 1)].ctrl[1];
+	/* Update SQ doorbell record with new SQ ci. */
+	rte_compiler_barrier();
+	*wq->sq_dbrec = rte_cpu_to_be_32(wq->sq_ci);
+	/* Make sure the doorbell record is updated. */
+	rte_wmb();
+	/* Write to doorbel register to start processing. */
+	__mlx5_uar_write64_relaxed(cs.w64, sh->tx_uar->reg_addr, NULL);
+	rte_wmb();
 }
 
 static void
@@ -433,6 +463,16 @@
 	uint32_t umem_size, umem_dbrec;
 	int ret;
 
+	sh->txpp.tsa = rte_zmalloc_socket(__func__,
+					   MLX5_TXPP_REARM_SQ_SIZE *
+					   sizeof(struct mlx5_txpp_ts),
+					   0, sh->numa_node);
+	if (!sh->txpp.tsa) {
+		DRV_LOG(ERR, "Failed to allocate memory for CQ stats.");
+		return -ENOMEM;
+	}
+	sh->txpp.ts_p = 0;
+	sh->txpp.ts_n = 0;
 	/* Allocate memory buffer for CQEs and doorbell record. */
 	umem_size = sizeof(struct mlx5_cqe) * MLX5_TXPP_CLKQ_SIZE;
 	umem_dbrec = RTE_ALIGN(umem_size, MLX5_DBR_SIZE);
@@ -562,6 +602,279 @@
 	return ret;
 }
 
+/* Enable notification from the Rearm Queue CQ. */
+static inline void
+mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *aq = &sh->txpp.rearm_queue;
+	uint32_t arm_sn = aq->arm_sn << MLX5_CQ_SQN_OFFSET;
+	uint32_t db_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | aq->cq_ci;
+	uint64_t db_be = rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq->id);
+	uint32_t *addr = RTE_PTR_ADD(sh->tx_uar->base_addr, MLX5_CQ_DOORBELL);
+
+	rte_compiler_barrier();
+	aq->cq_dbrec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(db_hi);
+	rte_wmb();
+#ifdef RTE_ARCH_64
+	*(uint64_t *)addr = db_be;
+#else
+	*(uint32_t *)addr = db_be;
+	rte_io_wmb();
+	*((uint32_t *)addr + 1) = db_be >> 32;
+#endif
+	aq->arm_sn++;
+}
+
+static inline void
+mlx5_atomic_read_cqe(rte_int128_t *from, rte_int128_t *ts)
+{
+	/*
+	 * The only CQE of the Clock Queue is continuously
+	 * updated by the hardware at the specified rate. We have to
+	 * read the timestamp and WQE completion index atomically.
+	 */
+#ifdef RTE_ARCH_PPC_64
+	/* Power architecture does not support 16B compare-and-swap. */
+	MLX5_ASSERT(false);
+#else
+	rte_int128_t src;
+
+	memset(&src, 0, sizeof(src));
+	*ts = src;
+	/* if (*from == *ts) *from = *src else *ts = *from; */
+	rte_atomic128_cmp_exchange(from, ts, &src, 0,
+				   __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+#endif
+}
+
+/* Stores timestamp in the cache structure to share data with datapath. */
+static inline void
+mlx5_txpp_cache_timestamp(struct mlx5_dev_ctx_shared *sh,
+			   uint64_t ts, uint64_t ci)
+{
+	ci = ci << (64 - MLX5_CQ_INDEX_WIDTH);
+	ci |= (ts << MLX5_CQ_INDEX_WIDTH) >> MLX5_CQ_INDEX_WIDTH;
+	rte_compiler_barrier();
+	rte_atomic64_set(&sh->txpp.ts.ts, ts);
+	rte_atomic64_set(&sh->txpp.ts.ci_ts, ci);
+	rte_wmb();
+}
+
+/* Reads timestamp from Clock Queue CQE and stores in the cache. */
+static inline void
+mlx5_txpp_update_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
+	union {
+		rte_int128_t u128;
+		struct mlx5_cqe_ts cts;
+	} to;
+	uint64_t ts;
+	uint16_t ci;
+
+	static_assert(sizeof(struct mlx5_cqe_ts) == sizeof(rte_int128_t),
+		      "Wrong timestamp CQE part size");
+	mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
+	if (to.cts.op_own >> 4) {
+		DRV_LOG(DEBUG, "Clock Queue error sync lost.");
+		rte_atomic32_inc(&sh->txpp.err_clock_queue);
+		sh->txpp.sync_lost = 1;
+		return;
+	}
+	ci = rte_be_to_cpu_16(to.cts.wqe_counter);
+	ts = rte_be_to_cpu_64(to.cts.timestamp);
+	ts = mlx5_txpp_convert_rx_ts(sh, ts);
+	wq->cq_ci += (ci - wq->sq_ci) & UINT16_MAX;
+	wq->sq_ci = ci;
+	mlx5_txpp_cache_timestamp(sh, ts, wq->cq_ci);
+}
+
+/* Gather statistics for timestamp from Clock Queue CQE. */
+static inline void
+mlx5_txpp_gather_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	/* Check whether we have a valid timestamp. */
+	if (!sh->txpp.clock_queue.sq_ci && !sh->txpp.ts_n)
+		return;
+	MLX5_ASSERT(sh->txpp.ts_p < MLX5_TXPP_REARM_SQ_SIZE);
+	sh->txpp.tsa[sh->txpp.ts_p] = sh->txpp.ts;
+	if (++sh->txpp.ts_p >= MLX5_TXPP_REARM_SQ_SIZE)
+		sh->txpp.ts_p = 0;
+	if (sh->txpp.ts_n < MLX5_TXPP_REARM_SQ_SIZE)
+		++sh->txpp.ts_n;
+}
+
+/* Waits for the first completion on Clock Queue to init timestamp. */
+static inline void
+mlx5_txpp_init_timestamp(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+	uint32_t wait;
+
+	sh->txpp.ts_p = 0;
+	sh->txpp.ts_n = 0;
+	for (wait = 0; wait < MLX5_TXPP_WAIT_INIT_TS; wait++) {
+		struct timespec onems;
+
+		mlx5_txpp_update_timestamp(sh);
+		if (wq->sq_ci)
+			return;
+		/* Wait one millisecond and try again. */
+		onems.tv_sec = 0;
+		onems.tv_nsec = NS_PER_S / MS_PER_S;
+		nanosleep(&onems, 0);
+	}
+	DRV_LOG(ERR, "Unable to initialize timestamp.");
+	sh->txpp.sync_lost = 1;
+}
+
+/* Handles Rearm Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_rearm_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_txpp_wq *wq = &sh->txpp.rearm_queue;
+	uint32_t cq_ci = wq->cq_ci;
+	bool error = false;
+	int ret;
+
+	do {
+		volatile struct mlx5_cqe *cqe;
+
+		cqe = &wq->cqes[cq_ci & (MLX5_TXPP_REARM_CQ_SIZE - 1)];
+		ret = check_cqe(cqe, MLX5_TXPP_REARM_CQ_SIZE, cq_ci);
+		switch (ret) {
+		case MLX5_CQE_STATUS_ERR:
+			error = true;
+			++cq_ci;
+			break;
+		case MLX5_CQE_STATUS_SW_OWN:
+			wq->sq_ci += 2;
+			++cq_ci;
+			break;
+		case MLX5_CQE_STATUS_HW_OWN:
+			break;
+		default:
+			MLX5_ASSERT(false);
+			break;
+		}
+	} while (ret != MLX5_CQE_STATUS_HW_OWN);
+	if (likely(cq_ci != wq->cq_ci)) {
+		/* Check whether we have missed interrupts. */
+		if (cq_ci - wq->cq_ci != 1) {
+			DRV_LOG(DEBUG, "Rearm Queue missed interrupt.");
+			rte_atomic32_inc(&sh->txpp.err_miss_int);
+			/* Check sync lost on wqe index. */
+			if (cq_ci - wq->cq_ci >=
+				(((1UL << MLX5_WQ_INDEX_WIDTH) /
+				  MLX5_TXPP_REARM) - 1))
+				error = 1;
+		}
+		/* Update doorbell record to notify hardware. */
+		rte_compiler_barrier();
+		*wq->cq_dbrec = rte_cpu_to_be_32(cq_ci);
+		rte_wmb();
+		wq->cq_ci = cq_ci;
+		/* Fire new requests to Rearm Queue. */
+		if (error) {
+			DRV_LOG(DEBUG, "Rearm Queue error sync lost.");
+			rte_atomic32_inc(&sh->txpp.err_rearm_queue);
+			sh->txpp.sync_lost = 1;
+		}
+	}
+}
+
+/* Handles Clock Queue completions in periodic service. */
+static __rte_always_inline void
+mlx5_txpp_handle_clock_queue(struct mlx5_dev_ctx_shared *sh)
+{
+	mlx5_txpp_update_timestamp(sh);
+	mlx5_txpp_gather_timestamp(sh);
+}
+
+/* Invoked periodically on Rearm Queue completions. */
+static void
+mlx5_txpp_interrupt_handler(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_EVENT
+	RTE_SET_USED(cb_arg);
+	return;
+#else
+	struct mlx5_dev_ctx_shared *sh = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* Process events in the loop. Only rearm completions are expected. */
+	while (mlx5_glue->devx_get_event
+				(sh->txpp.echan,
+				 &out.event_resp,
+				 sizeof(out.buf)) >=
+				 (ssize_t)sizeof(out.event_resp.cookie)) {
+		mlx5_txpp_handle_rearm_queue(sh);
+		mlx5_txpp_handle_clock_queue(sh);
+		mlx5_txpp_cq_arm(sh);
+		mlx5_txpp_doorbell_rearm_queue
+					(sh, sh->txpp.rearm_queue.sq_ci - 1);
+	}
+#endif /* HAVE_IBV_DEVX_EVENT */
+}
+
+static void
+mlx5_txpp_stop_service(struct mlx5_dev_ctx_shared *sh)
+{
+	if (!sh->txpp.intr_handle.fd)
+		return;
+	mlx5_intr_callback_unregister(&sh->txpp.intr_handle,
+				      mlx5_txpp_interrupt_handler, sh);
+	sh->txpp.intr_handle.fd = 0;
+}
+
+/* Attaches the interrupt handler and fires the first request to Rearm Queue. */
+static int
+mlx5_txpp_start_service(struct mlx5_dev_ctx_shared *sh)
+{
+	uint16_t event_nums[1] = {0};
+	int flags;
+	int ret;
+
+	/* Attach interrupt handler to process Rearm Queue completions. */
+	flags = fcntl(sh->txpp.echan->fd, F_GETFL);
+	ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change event channel FD.");
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	memset(&sh->txpp.intr_handle, 0, sizeof(sh->txpp.intr_handle));
+	sh->txpp.intr_handle.fd = sh->txpp.echan->fd;
+	sh->txpp.intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&sh->txpp.intr_handle,
+				       mlx5_txpp_interrupt_handler, sh)) {
+		sh->txpp.intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
+		return -rte_errno;
+	}
+	/* Subscribe CQ event to the event channel controlled by the driver. */
+	ret = mlx5_glue->devx_subscribe_devx_event(sh->txpp.echan,
+						   sh->txpp.rearm_queue.cq->obj,
+						   sizeof(event_nums),
+						   event_nums, 0);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe CQE event.");
+		rte_errno = errno;
+		return -errno;
+	}
+	/* Enable interrupts in the CQ. */
+	mlx5_txpp_cq_arm(sh);
+	/* Fire the first request on Rearm Queue. */
+	mlx5_txpp_doorbell_rearm_queue(sh, sh->txpp.rearm_queue.sq_size - 1);
+	mlx5_txpp_init_timestamp(sh);
+	return 0;
+}
+
 /*
  * The routine initializes the packet pacing infrastructure:
  * - allocates PP context
@@ -595,8 +908,12 @@
 	ret = mlx5_txpp_create_rearm_queue(sh);
 	if (ret)
 		goto exit;
+	ret = mlx5_txpp_start_service(sh);
+	if (ret)
+		goto exit;
 exit:
 	if (ret) {
+		mlx5_txpp_stop_service(sh);
 		mlx5_txpp_destroy_rearm_queue(sh);
 		mlx5_txpp_destroy_clock_queue(sh);
 		mlx5_txpp_free_pp_index(sh);
@@ -618,6 +935,7 @@
 static void
 mlx5_txpp_destroy(struct mlx5_dev_ctx_shared *sh)
 {
+	mlx5_txpp_stop_service(sh);
 	mlx5_txpp_destroy_rearm_queue(sh);
 	mlx5_txpp_destroy_clock_queue(sh);
 	mlx5_txpp_free_pp_index(sh);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 10/16] net/mlx5: prepare Tx queue structures to support timestamp
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (7 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 09/16] net/mlx5: introduce clock queue service routine Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 11/16] net/mlx5: convert timestamp to completion index Viacheslav Ovsiienko
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

The fields to support send scheduling on the dynamic timestamp
mbuf field are introduced and initialized on device start.
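
Purely illustrative (not part of the patch): assuming the dynamic
timestamp field and the Tx timestamp flag have already been registered
(registration itself is defined by the separate mbuf change, not by
this series), an application could stamp packets before
rte_eth_tx_burst() roughly as follows. The lookup is done once at
initialization, the per-packet part only writes the field and sets the
flag:

#include <rte_mbuf.h>
#include <rte_mbuf_dyn.h>

static int ts_off = -1;		/* dynamic field offset */
static uint64_t ts_flag;	/* dynamic flag mask */

static int
tx_timestamp_init(void)
{
	int bit;

	ts_off = rte_mbuf_dynfield_lookup(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME,
					  NULL);
	bit = rte_mbuf_dynflag_lookup(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME,
				      NULL);
	if (ts_off < 0 || bit < 0)
		return -1;	/* field or flag not registered */
	ts_flag = 1ULL << bit;
	return 0;
}

static void
tx_timestamp_set(struct rte_mbuf *mbuf, uint64_t ts)
{
	/* Write the scheduled time and mark the mbuf for scheduling. */
	*RTE_MBUF_DYNFIELD(mbuf, ts_off, uint64_t *) = ts;
	mbuf->ol_flags |= ts_flag;
}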

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.h    |  4 ++++
 drivers/net/mlx5/mlx5_trigger.c |  2 ++
 drivers/net/mlx5/mlx5_txq.c     | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 8a8d2b5..974a847 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -313,6 +313,9 @@ struct mlx5_txq_data {
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	uint16_t port_id; /* Port ID of device. */
 	uint16_t idx; /* Queue index. */
+	uint64_t ts_mask; /* Timestamp flag dynamic mask. */
+	int32_t ts_offset; /* Timestamp field dynamic offset. */
+	struct mlx5_dev_ctx_shared *sh; /* Shared context. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 #ifndef RTE_ARCH_64
 	rte_spinlock_t *uar_lock;
@@ -468,6 +471,7 @@ struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
 void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
 void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
 uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
+void mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev);
 
 /* mlx5_rxtx.c */
 
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 449dd95..b713974 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -331,6 +331,8 @@
 	}
 	/* Set a mask and offset of dynamic metadata flows into Rx queues*/
 	mlx5_flow_rxq_dynf_metadata_set(dev);
+	/* Set a mask and offset of scheduling on timestamp into Tx queues*/
+	mlx5_txq_dynf_timestamp_set(dev);
 	/*
 	 * In non-cached mode, it only needs to start the default mreg copy
 	 * action and no flow created by application exists anymore.
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index a6f7e1c..d3b2863 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -1778,3 +1778,35 @@ struct mlx5_txq_ctrl *
 	}
 	return ret;
 }
+
+/**
+ * Set the Tx queue dynamic timestamp (mask and offset)
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ */
+void
+mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_txq_data *data;
+	int off, nbit;
+	unsigned int i;
+	uint64_t mask = 0;
+
+	nbit = rte_mbuf_dynflag_lookup
+				(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL);
+	off = rte_mbuf_dynfield_lookup
+				(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL);
+	if (nbit > 0 && off >= 0 && sh->txpp.refcnt)
+		mask = 1ULL << nbit;
+	for (i = 0; i != priv->txqs_n; ++i) {
+		data = (*priv->txqs)[i];
+		if (!data)
+			continue;
+		data->sh = sh;
+		data->ts_mask = mask;
+		data->ts_offset = off;
+	}
+}
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 11/16] net/mlx5: convert timestamp to completion index
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (8 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 10/16] net/mlx5: prepare Tx queue structures to support timestamp Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 12/16] net/mlx5: prepare Tx datapath to support scheduling Viacheslav Ovsiienko
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

The application provides timestamps in the Tx mbuf as clock values;
the hardware performs scheduling on a Clock Queue completion index
match. This patch introduces the timestamp-to-completion-index
inline conversion routine.
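
A hedged, standalone sketch of the conversion arithmetic (the inline
routine below additionally applies the configured skew correction and
performs the wrap with shifts rather than a mask), assuming a 24-bit
Clock Queue completion index:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define CQ_INDEX_WIDTH 24	/* assumed Clock Queue CQ index width */

/* mts - timestamp from the mbuf, ts/ci - cached Clock Queue reading,
 * tick - completion period in the same units as the timestamps.
 */
static int32_t
convert_tx_ts(uint64_t mts, uint64_t ts, uint64_t ci, uint32_t tick)
{
	uint64_t delta = mts - ts;

	if (delta >= UINT64_MAX / 2)
		return -1;			/* timestamp in the past */
	delta = (delta + tick - 1) / tick;	/* round up to completions */
	if (delta >= (1UL << CQ_INDEX_WIDTH) / 2 - 1)
		return -1;			/* too far in the future */
	return (ci + delta) & ((1UL << CQ_INDEX_WIDTH) - 1);
}

int main(void)
{
	/* Cached ts = 1000000 at ci = 100, tick = 500, the packet asks for
	 * ts + 12345 -> ceil(12345 / 500) = 25 completions ahead.
	 */
	printf("wait for completion index %" PRId32 "\n",
	       convert_tx_ts(1012345, 1000000, 100, 500));
	return 0;
}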

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h      |  2 ++
 drivers/net/mlx5/mlx5_rxtx.h | 55 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_txpp.c |  5 ++++
 3 files changed, 62 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 52b38cc..a9a60fb 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -585,6 +585,8 @@ struct mlx5_dev_txpp {
 	rte_atomic32_t err_miss_int; /* Missed service interrupt. */
 	rte_atomic32_t err_rearm_queue; /* Rearm Queue errors. */
 	rte_atomic32_t err_clock_queue; /* Clock Queue errors. */
+	rte_atomic32_t err_ts_past; /* Timestamp in the past. */
+	rte_atomic32_t err_ts_future; /* Timestamp in the distant future. */
 };
 
 /*
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 974a847..d082cd7 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -719,4 +719,59 @@ int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova,
 	return (ts & UINT32_MAX) + (ts >> 32) * NS_PER_S;
 }
 
+/**
+ * Convert timestamp from mbuf format to linear counter
+ * of Clock Queue completions (24 bits)
+ *
+ * @param sh
+ *   Pointer to the device shared context to fetch Tx
+ *   packet pacing timestamp and parameters.
+ * @param ts
+ *   Timestamp from mbuf to convert.
+ * @return
+ *   positive or zero value - completion ID to wait for
+ *   negative value - conversion error
+ */
+static __rte_always_inline int32_t
+mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
+{
+	uint64_t ts, ci;
+	uint32_t tick;
+
+	do {
+		/*
+		 * Read atomically two uint64_t fields and compare lsb bits.
+		 * If there is no match - the timestamp was updated in
+		 * the service thread, data should be re-read.
+		 */
+		rte_compiler_barrier();
+		ci = rte_atomic64_read(&sh->txpp.ts.ci_ts);
+		ts = rte_atomic64_read(&sh->txpp.ts.ts);
+		rte_compiler_barrier();
+		if (!((ts ^ ci) << (64 - MLX5_CQ_INDEX_WIDTH)))
+			break;
+	} while (true);
+	/* Perform the skew correction, positive value to send earlier. */
+	mts -= sh->txpp.skew;
+	mts -= ts;
+	if (unlikely(mts >= UINT64_MAX / 2)) {
+		/* The delta is negative, the timestamp is in the past. */
+		rte_atomic32_inc(&sh->txpp.err_ts_past);
+		return -1;
+	}
+	tick = sh->txpp.tick;
+	MLX5_ASSERT(tick);
+	/* Convert delta to completions, round up. */
+	mts = (mts + tick - 1) / tick;
+	if (unlikely(mts >= (1 << MLX5_CQ_INDEX_WIDTH) / 2 - 1)) {
+		/* The timestamp is too far in the future. */
+		rte_atomic32_inc(&sh->txpp.err_ts_future);
+		return -1;
+	}
+	mts <<= 64 - MLX5_CQ_INDEX_WIDTH;
+	ci += mts;
+	ci >>= 64 - MLX5_CQ_INDEX_WIDTH;
+	return ci;
+}
+
 #endif /* RTE_PMD_MLX5_RXTX_H_ */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 3736f7a..93dbeb2 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -840,6 +840,11 @@
 	int flags;
 	int ret;
 
+	rte_atomic32_set(&sh->txpp.err_miss_int, 0);
+	rte_atomic32_set(&sh->txpp.err_rearm_queue, 0);
+	rte_atomic32_set(&sh->txpp.err_clock_queue, 0);
+	rte_atomic32_set(&sh->txpp.err_ts_past, 0);
+	rte_atomic32_set(&sh->txpp.err_ts_future, 0);
 	/* Attach interrupt handler to process Rearm Queue completions. */
 	flags = fcntl(sh->txpp.echan->fd, F_GETFL);
 	ret = fcntl(sh->txpp.echan->fd, F_SETFL, flags | O_NONBLOCK);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 12/16] net/mlx5: prepare Tx datapath to support scheduling
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (9 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 11/16] net/mlx5: convert timestamp to completion index Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 13/16] net/mlx5: add scheduling support to send routine template Viacheslav Ovsiienko
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

A new static control flag is introduced to control routine
generation from the template, enabling the scheduling
on timestamps.
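
For readers unfamiliar with the template approach used in mlx5_rxtx.c,
here is a self-contained illustration of the technique (not the PMD
macros themselves): the offload set is a compile-time constant argument
of an always-inlined template, so the compiler drops the disabled
branches when each specialized routine is instantiated.

#include <stdio.h>

#define CONFIG_CSUM (1u << 0)
#define CONFIG_TXPP (1u << 1)	/* scheduling on timestamps */

static inline int
tx_burst_tmpl(int pkts, unsigned int olx)
{
	int done;

	for (done = 0; done < pkts; done++) {
		if (olx & CONFIG_TXPP) {
			/* Scheduling path: would emit the WAIT WQE here. */
		}
		if (olx & CONFIG_CSUM) {
			/* Checksum offload path. */
		}
	}
	return done;
}

/* Two specialized instances, in the spirit of MLX5_TXOFF_DECL/INFO. */
static int
tx_burst_full_ts(int pkts)
{
	return tx_burst_tmpl(pkts, CONFIG_CSUM | CONFIG_TXPP);
}

static int
tx_burst_none(int pkts)
{
	return tx_burst_tmpl(pkts, 0);
}

int main(void)
{
	printf("%d %d\n", tx_burst_full_ts(4), tx_burst_none(4));
	return 0;
}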

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 72 ++++++++++++++++++++++++++++++++++++++++++--
 drivers/net/mlx5/mlx5_txq.c  |  2 ++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index c456d20..1339744 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -66,6 +66,7 @@ enum mlx5_txcmp_code {
 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported.*/
 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported.*/
+#define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp.*/
 
 /* The most common offloads groups. */
 #define MLX5_TXOFF_CONFIG_NONE 0
@@ -5268,6 +5269,32 @@ enum mlx5_txcmp_code {
 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
 		MLX5_TXOFF_CONFIG_METADATA)
 
+/* Generate routines with timestamp scheduling. */
+MLX5_TXOFF_DECL(full_ts_nompw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_DECL(full_ts,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(none_ts,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mdi_ts,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mti_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_DECL(mtiv_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
 /*
  * Generate routines with Legacy Multi-Packet Write support.
  * This mode is supported by ConnectX-4 Lx only and imposes
@@ -5372,6 +5399,32 @@ enum mlx5_txcmp_code {
 		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
 		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_EMPW)
 
+MLX5_TXOFF_INFO(full_ts_nompw,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP)
+
+MLX5_TXOFF_INFO(full_ts,
+		MLX5_TXOFF_CONFIG_FULL | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(none_ts,
+		MLX5_TXOFF_CONFIG_NONE | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mdi_ts,
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mti_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_METADATA |
+		MLX5_TXOFF_CONFIG_TXPP | MLX5_TXOFF_CONFIG_EMPW)
+
+MLX5_TXOFF_INFO(mtiv_ts,
+		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
+		MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_VLAN |
+		MLX5_TXOFF_CONFIG_METADATA | MLX5_TXOFF_CONFIG_TXPP |
+		MLX5_TXOFF_CONFIG_EMPW)
+
 MLX5_TXOFF_INFO(full,
 		MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_TSO |
 		MLX5_TXOFF_CONFIG_SWP |	MLX5_TXOFF_CONFIG_CSUM |
@@ -5518,6 +5571,14 @@ enum mlx5_txcmp_code {
 		/* We should support VLAN insertion. */
 		olx |= MLX5_TXOFF_CONFIG_VLAN;
 	}
+	if (tx_offloads & DEV_TX_OFFLOAD_SEND_ON_TIMESTAMP &&
+	    rte_mbuf_dynflag_lookup
+			(RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME, NULL) > 0 &&
+	    rte_mbuf_dynfield_lookup
+			(RTE_MBUF_DYNFIELD_TIMESTAMP_NAME, NULL) > 0) {
+		/* Offload configured, dynamic entities registered. */
+		olx |= MLX5_TXOFF_CONFIG_TXPP;
+	}
 	if (priv->txqs_n && (*priv->txqs)[0]) {
 		struct mlx5_txq_data *txd = (*priv->txqs)[0];
 
@@ -5587,6 +5648,9 @@ enum mlx5_txcmp_code {
 		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_INLINE)
 			/* Do not enable inlining if not configured. */
 			continue;
+		if ((olx ^ tmp) & MLX5_TXOFF_CONFIG_TXPP)
+			/* Do not enable scheduling if not configured. */
+			continue;
 		/*
 		 * Some routine meets the requirements.
 		 * Check whether it has minimal amount
@@ -5631,6 +5695,8 @@ enum mlx5_txcmp_code {
 		DRV_LOG(DEBUG, "\tVLANI (VLAN insertion)");
 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_METADATA)
 		DRV_LOG(DEBUG, "\tMETAD (tx Flow metadata)");
+	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_TXPP)
+		DRV_LOG(DEBUG, "\tTXPP  (tx Scheduling)");
 	if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_EMPW) {
 		if (txoff_func[m].olx & MLX5_TXOFF_CONFIG_MPW)
 			DRV_LOG(DEBUG, "\tMPW   (Legacy MPW)");
@@ -5705,7 +5771,7 @@ enum mlx5_txcmp_code {
 		if (pkt_burst == txoff_func[i].func) {
 			olx = txoff_func[i].olx;
 			snprintf(mode->info, sizeof(mode->info),
-				 "%s%s%s%s%s%s%s%s",
+				 "%s%s%s%s%s%s%s%s%s",
 				 (olx & MLX5_TXOFF_CONFIG_EMPW) ?
 				 ((olx & MLX5_TXOFF_CONFIG_MPW) ?
 				 "Legacy MPW" : "Enhanced MPW") : "No MPW",
@@ -5722,7 +5788,9 @@ enum mlx5_txcmp_code {
 				 (olx & MLX5_TXOFF_CONFIG_VLAN) ?
 				 " + VLAN" : "",
 				 (olx & MLX5_TXOFF_CONFIG_METADATA) ?
-				 " + METADATA" : "");
+				 " + METADATA" : "",
+				 (olx & MLX5_TXOFF_CONFIG_TXPP) ?
+				 " + TXPP" : "");
 			return 0;
 		}
 	}
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index d3b2863..f2c6145 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -119,6 +119,8 @@
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	if (config->tso)
 		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (config->tx_pp)
+		offloads |= DEV_TX_OFFLOAD_SEND_ON_TIMESTAMP;
 	if (config->swp) {
 		if (config->hw_csum)
 			offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 13/16] net/mlx5: add scheduling support to send routine template
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (10 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 12/16] net/mlx5: prepare Tx datapath to support scheduling Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 14/16] net/mlx5: add read device clock support Viacheslav Ovsiienko
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

This patch adds send scheduling on timestamps to the tx_burst routine
template. The feature is controlled by a static configuration flag;
the actual routines supporting the new feature are generated from
this updated template.
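
For reference, below is an application-side sketch (not part of this
patch): the dynamic timestamp field and flag, named as in the referenced
mbuf patch (rte_mbuf_dyn.h), are registered before the port is started so
the routine selection added in the previous patch can detect them, and
each scheduled mbuf carries an absolute device clock value. Error
handling is trimmed.

/* Application-side sketch: register the dynamic timestamp field/flag
 * so the PMD selects a *_ts routine, then schedule one mbuf. */
#include <stdint.h>
#include <rte_mbuf.h>
#include <rte_mbuf_dyn.h>

static int ts_offset = -1;	/* dynamic field offset */
static uint64_t ts_flag;	/* dynamic flag mask */

static int
register_tx_timestamp(void)
{
	static const struct rte_mbuf_dynfield field_desc = {
		.name = RTE_MBUF_DYNFIELD_TIMESTAMP_NAME,
		.size = sizeof(uint64_t),
		.align = __alignof__(uint64_t),
	};
	static const struct rte_mbuf_dynflag flag_desc = {
		.name = RTE_MBUF_DYNFLAG_TX_TIMESTAMP_NAME,
	};
	int off = rte_mbuf_dynfield_register(&field_desc);
	int bit = rte_mbuf_dynflag_register(&flag_desc);

	if (off < 0 || bit < 0)
		return -1;
	ts_offset = off;
	ts_flag = 1ULL << bit;
	return 0;
}

static void
schedule_mbuf(struct rte_mbuf *m, uint64_t when)
{
	/* 'when' is an absolute device clock value for this packet. */
	*RTE_MBUF_DYNFIELD(m, ts_offset, uint64_t *) = when;
	m->ol_flags |= ts_flag;
}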

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 162 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 161 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1339744..cdf5cc9 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -2404,6 +2404,37 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * Build the Synchronize Queue Segment with specified completion index.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param wqe
+ *   Pointer to WQE to fill with built Control Segment.
+ * @param wci
+ *   Completion index in the Clock Queue to wait for.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
+		  struct mlx5_txq_local *restrict loc __rte_unused,
+		  struct mlx5_wqe *restrict wqe,
+		  unsigned int wci,
+		  unsigned int olx __rte_unused)
+{
+	struct mlx5_wqe_qseg *qs;
+
+	qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
+	qs->max_index = rte_cpu_to_be_32(wci);
+	qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq->id);
+	qs->reserved0 = RTE_BE32(0);
+	qs->reserved1 = RTE_BE32(0);
+}
+
+/**
  * Build the Ethernet Segment without inlined data.
  * Supports Software Parser, Checksums and VLAN
  * insertion Tx offload features.
@@ -3241,6 +3272,59 @@ enum mlx5_txcmp_code {
 }
 
 /**
+ * The routine checks the timestamp flag in the current packet,
+ * and pushes a WAIT WQE into the queue if scheduling is required.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param loc
+ *   Pointer to burst routine local context.
+ * @param olx
+ *   Configured Tx offloads mask. It is fully defined at
+ *   compile time and may be used for optimization.
+ *
+ * @return
+ *   MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
+ *   MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
+ *   MLX5_TXCMP_CODE_MULTI - the WAIT inserted, continue processing.
+ * Local context variables partially updated.
+ */
+static __rte_always_inline enum mlx5_txcmp_code
+mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
+		      struct mlx5_txq_local *restrict loc,
+		      unsigned int olx)
+{
+	if (MLX5_TXOFF_CONFIG(TXPP) &&
+	    loc->mbuf->ol_flags & txq->ts_mask) {
+		struct mlx5_wqe *wqe;
+		uint64_t ts;
+		int32_t wci;
+
+		/*
+		 * Estimate the required space quickly and roughly.
+		 * We would like to ensure the packet can be pushed
+		 * to the queue and we won't get the orphan WAIT WQE.
+		 */
+		if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
+		    loc->elts_free < NB_SEGS(loc->mbuf))
+			return MLX5_TXCMP_CODE_EXIT;
+		/* Convert the timestamp into completion to wait. */
+		ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
+		wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
+		if (unlikely(wci < 0))
+			return MLX5_TXCMP_CODE_SINGLE;
+		/* Build the WAIT WQE with specified completion. */
+		wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
+		mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
+		mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
+		++txq->wqe_ci;
+		--loc->wqe_free;
+		return MLX5_TXCMP_CODE_MULTI;
+	}
+	return MLX5_TXCMP_CODE_SINGLE;
+}
+
+/**
  * Tx one packet function for multi-segment TSO. Supports all
  * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
  * sends one packet per WQE.
@@ -3269,6 +3353,16 @@ enum mlx5_txcmp_code {
 	struct mlx5_wqe *restrict wqe;
 	unsigned int ds, dlen, inlen, ntcp, vlan = 0;
 
+	if (MLX5_TXOFF_CONFIG(TXPP)) {
+		enum mlx5_txcmp_code wret;
+
+		/* Generate WAIT for scheduling if requested. */
+		wret = mlx5_tx_schedule_send(txq, loc, olx);
+		if (wret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (wret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+	}
 	/*
 	 * Calculate data length to be inlined to estimate
 	 * the required space in WQE ring buffer.
@@ -3360,6 +3454,16 @@ enum mlx5_txcmp_code {
 	unsigned int ds, nseg;
 
 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+	if (MLX5_TXOFF_CONFIG(TXPP)) {
+		enum mlx5_txcmp_code wret;
+
+		/* Generate WAIT for scheduling if requested. */
+		wret = mlx5_tx_schedule_send(txq, loc, olx);
+		if (wret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (wret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+	}
 	/*
 	 * No inline at all, it means the CPU cycles saving
 	 * is prioritized at configuration, we should not
@@ -3468,6 +3572,16 @@ enum mlx5_txcmp_code {
 
 	MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
 	MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
+	if (MLX5_TXOFF_CONFIG(TXPP)) {
+		enum mlx5_txcmp_code wret;
+
+		/* Generate WAIT for scheduling if requested. */
+		wret = mlx5_tx_schedule_send(txq, loc, olx);
+		if (wret == MLX5_TXCMP_CODE_EXIT)
+			return MLX5_TXCMP_CODE_EXIT;
+		if (wret == MLX5_TXCMP_CODE_ERROR)
+			return MLX5_TXCMP_CODE_ERROR;
+	}
 	/*
 	 * First calculate data length to be inlined
 	 * to estimate the required space for WQE.
@@ -3730,6 +3844,16 @@ enum mlx5_txcmp_code {
 		uint8_t *dptr;
 
 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
 		dlen = rte_pktmbuf_data_len(loc->mbuf);
 		if (MLX5_TXOFF_CONFIG(VLAN) &&
 		    loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
@@ -3892,7 +4016,7 @@ enum mlx5_txcmp_code {
  *  false - no match, eMPW should be restarted.
  */
 static __rte_always_inline bool
-mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq __rte_unused,
+mlx5_tx_match_empw(struct mlx5_txq_data *restrict txq,
 		   struct mlx5_wqe_eseg *restrict es,
 		   struct mlx5_txq_local *restrict loc,
 		   uint32_t dlen,
@@ -3921,6 +4045,10 @@ enum mlx5_txcmp_code {
 	/* There must be no VLAN packets in eMPW loop. */
 	if (MLX5_TXOFF_CONFIG(VLAN))
 		MLX5_ASSERT(!(loc->mbuf->ol_flags & PKT_TX_VLAN_PKT));
+	/* Check if the scheduling is requested. */
+	if (MLX5_TXOFF_CONFIG(TXPP) &&
+	    loc->mbuf->ol_flags & txq->ts_mask)
+		return false;
 	return true;
 }
 
@@ -4106,6 +4234,16 @@ enum mlx5_txcmp_code {
 
 next_empw:
 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
 		part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
 				       MLX5_MPW_MAX_PACKETS :
 				       MLX5_EMPW_MAX_PACKETS);
@@ -4201,6 +4339,7 @@ enum mlx5_txcmp_code {
 			 * - metadata value
 			 * - software parser settings
 			 * - packets length (legacy MPW only)
+			 * - scheduling is not required
 			 */
 			if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
 				MLX5_ASSERT(loop);
@@ -4271,6 +4410,16 @@ enum mlx5_txcmp_code {
 		unsigned int slen = 0;
 
 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
 		/*
 		 * Limits the amount of packets in one WQE
 		 * to improve CQE latency generation.
@@ -4496,6 +4645,7 @@ enum mlx5_txcmp_code {
 			 * - metadata value
 			 * - software parser settings
 			 * - packets length (legacy MPW only)
+			 * - scheduling is not required
 			 */
 			if (!mlx5_tx_match_empw(txq, &wqem->eseg,
 						loc, dlen, olx))
@@ -4545,6 +4695,16 @@ enum mlx5_txcmp_code {
 		enum mlx5_txcmp_code ret;
 
 		MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
+		if (MLX5_TXOFF_CONFIG(TXPP)) {
+			enum mlx5_txcmp_code wret;
+
+			/* Generate WAIT for scheduling if requested. */
+			wret = mlx5_tx_schedule_send(txq, loc, olx);
+			if (wret == MLX5_TXCMP_CODE_EXIT)
+				return MLX5_TXCMP_CODE_EXIT;
+			if (wret == MLX5_TXCMP_CODE_ERROR)
+				return MLX5_TXCMP_CODE_ERROR;
+		}
 		if (MLX5_TXOFF_CONFIG(INLINE)) {
 			unsigned int inlen, vlan = 0;
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 14/16] net/mlx5: add read device clock support
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (11 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 13/16] net/mlx5: add scheduling support to send routine template Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 15/16] net/mlx5: provide the send scheduling error statistics Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 16/16] common/mlx5: add register access DevX routine Viacheslav Ovsiienko
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

If the send scheduling feature is engaged, a Clock Queue is created
that reliably reports the current device clock counter value. The
device clock counter can be read directly from the Clock Queue CQE.
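
A usage sketch of the ethdev call this callback implements is shown
below; it assumes the port is an mlx5 port with packet pacing (the
"tx_pp" devarg) started, otherwise -ENOTSUP is returned.

/* Sample the raw device clock twice to estimate its frequency. */
#include <stdio.h>
#include <inttypes.h>
#include <rte_ethdev.h>
#include <rte_cycles.h>

static void
estimate_device_hz(uint16_t port_id)
{
	uint64_t c0, c1;

	if (rte_eth_read_clock(port_id, &c0) != 0)
		return; /* -ENOTSUP if pacing is not active on this port */
	rte_delay_ms(100);
	if (rte_eth_read_clock(port_id, &c1) != 0)
		return;
	/* Raw ticks over ~100 ms give a rough device clock frequency. */
	printf("device clock ~%" PRIu64 " Hz\n", (c1 - c0) * 10);
}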

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/linux/mlx5_os.c |  4 ++-
 drivers/net/mlx5/mlx5.h          |  1 +
 drivers/net/mlx5/mlx5_txpp.c     | 55 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index ff93095..c2326a5 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -2342,7 +2342,7 @@
 	.xstats_get_names = mlx5_xstats_get_names,
 	.fw_version_get = mlx5_fw_version_get,
 	.dev_infos_get = mlx5_dev_infos_get,
-	.read_clock = mlx5_read_clock,
+	.read_clock = mlx5_txpp_read_clock,
 	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
 	.vlan_filter_set = mlx5_vlan_filter_set,
 	.rx_queue_setup = mlx5_rx_queue_setup,
@@ -2391,6 +2391,7 @@
 	.xstats_get_names = mlx5_xstats_get_names,
 	.fw_version_get = mlx5_fw_version_get,
 	.dev_infos_get = mlx5_dev_infos_get,
+	.read_clock = mlx5_txpp_read_clock,
 	.rx_descriptor_status = mlx5_rx_descriptor_status,
 	.tx_descriptor_status = mlx5_tx_descriptor_status,
 	.rxq_info_get = mlx5_rxq_info_get,
@@ -2421,6 +2422,7 @@
 	.xstats_get_names = mlx5_xstats_get_names,
 	.fw_version_get = mlx5_fw_version_get,
 	.dev_infos_get = mlx5_dev_infos_get,
+	.read_clock = mlx5_txpp_read_clock,
 	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
 	.vlan_filter_set = mlx5_vlan_filter_set,
 	.rx_queue_setup = mlx5_rx_queue_setup,
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a9a60fb..31cd37f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1010,5 +1010,6 @@ void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
 
 int mlx5_txpp_start(struct rte_eth_dev *dev);
 void mlx5_txpp_stop(struct rte_eth_dev *dev);
+int mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 93dbeb2..202e6b3 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -1035,3 +1035,58 @@
 	MLX5_ASSERT(!ret);
 	RTE_SET_USED(ret);
 }
+
+/*
+ * Read the current clock counter of an Ethernet device
+ *
+ * This returns the current raw clock value of an Ethernet device. It is
+ * a raw amount of ticks, with no given time reference.
+ * The value returned here is from the same clock as the one that fills
+ * the timestamp field of Rx/Tx packets when using the hardware timestamp
+ * offload. Therefore it can be used to compute a precise conversion of
+ * the device clock to real time.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param timestamp
+ *   Pointer to the uint64_t buffer to store the raw clock value.
+ *
+ * @return
+ *   - 0: Success.
+ *   - -ENOTSUP: The function is not supported in this mode. Requires
+ *     packet pacing module configured and started (tx_pp devarg)
+ */
+int
+mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	int ret;
+
+	if (sh->txpp.refcnt) {
+		struct mlx5_txpp_wq *wq = &sh->txpp.clock_queue;
+		struct mlx5_cqe *cqe = (struct mlx5_cqe *)(uintptr_t)wq->cqes;
+		union {
+			rte_int128_t u128;
+			struct mlx5_cqe_ts cts;
+		} to;
+		uint64_t ts;
+
+		mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
+		if (to.cts.op_own >> 4) {
+			DRV_LOG(DEBUG, "Clock Queue error sync lost.");
+			rte_atomic32_inc(&sh->txpp.err_clock_queue);
+			sh->txpp.sync_lost = 1;
+			return -EIO;
+		}
+		ts = rte_be_to_cpu_64(to.cts.timestamp);
+		ts = mlx5_txpp_convert_rx_ts(sh, ts);
+		*timestamp = ts;
+		return 0;
+	}
+	/* Not supported in isolated mode - kernel does not see the CQEs. */
+	if (priv->isolated || rte_eal_process_type() != RTE_PROC_PRIMARY)
+		return -ENOTSUP;
+	ret = mlx5_read_clock(dev, timestamp);
+	return ret;
+}
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 15/16] net/mlx5: provide the send scheduling error statistics
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (12 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 14/16] net/mlx5: add read device clock support Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 16/16] common/mlx5: add register access DevX routine Viacheslav Ovsiienko
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

The mlx5 PMD exposes the following newly introduced extended
statistics counters to report the errors of packet send scheduling
on timestamps (a usage sketch follows the list below):

  - txpp_err_miss_int - the Rearm Queue interrupt was not handled
    in time and the service routine might have missed completions

  - txpp_err_rearm_queue - reports errors in rearm queue
  - txpp_err_clock_queue - reports errors in clock queue

  - txpp_err_ts_past - timestamps in the packet being sent
    were found in the past, timestamps were ignored

  - txpp_err_ts_future - timestamps in the packet being sent
    were found too far in the future (beyond the HW/Clock Queue
    scheduling capability, typically about 16M of
    tx_pp devarg periods)

  - txpp_jitter - estimated jitter in device clocks between
    8K completions of Clock Queue.

  - txpp_wander - estimated wander in device clocks between
    16M completions of Clock Queue.

  - txpp_sync_lost - error flag, the Clock Queue completions
    synchronization is lost, accurate packet scheduling can
    not be handled, timestamps are being ignored, the restart
    of all ports using scheduling must be performed.
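
A usage sketch (not part of the patch) of reading these counters
through the standard xstats API, filtering by the txpp_ prefix:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <rte_ethdev.h>

static void
dump_txpp_xstats(uint16_t port_id)
{
	int n = rte_eth_xstats_get_names(port_id, NULL, 0);

	if (n <= 0)
		return;
	struct rte_eth_xstat_name *names = calloc(n, sizeof(*names));
	struct rte_eth_xstat *vals = calloc(n, sizeof(*vals));

	if (names == NULL || vals == NULL)
		goto out;
	if (rte_eth_xstats_get_names(port_id, names, n) != n ||
	    rte_eth_xstats_get(port_id, vals, n) != n)
		goto out;
	for (int i = 0; i < n; i++)
		if (strncmp(names[i].name, "txpp_", 5) == 0)
			printf("%s: %" PRIu64 "\n",
			       names[i].name, vals[i].value);
out:
	free(names);
	free(vals);
}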

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5.h       |   7 ++
 drivers/net/mlx5/mlx5_stats.c |   7 +-
 drivers/net/mlx5/mlx5_txpp.c  | 219 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 231 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 31cd37f..5c82a25 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1011,5 +1011,12 @@ void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
 int mlx5_txpp_start(struct rte_eth_dev *dev);
 void mlx5_txpp_stop(struct rte_eth_dev *dev);
 int mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp);
+int mlx5_txpp_xstats_get(struct rte_eth_dev *dev,
+			 struct rte_eth_xstat *stats,
+			 unsigned int n, unsigned int n_used);
+int mlx5_txpp_xstats_reset(struct rte_eth_dev *dev);
+int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev,
+			       struct rte_eth_xstat_name *xstats_names,
+			       unsigned int n, unsigned int n_used);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_stats.c b/drivers/net/mlx5/mlx5_stats.c
index a9b33ee..e30542e 100644
--- a/drivers/net/mlx5/mlx5_stats.c
+++ b/drivers/net/mlx5/mlx5_stats.c
@@ -75,6 +75,7 @@
 			}
 		}
 	}
+	mlx5_stats_n = mlx5_txpp_xstats_get(dev, stats, n, mlx5_stats_n);
 	return mlx5_stats_n;
 }
 
@@ -237,7 +238,7 @@
 		xstats_ctrl->base[i] = counters[i];
 		xstats_ctrl->hw_stats[i] = 0;
 	}
-
+	mlx5_txpp_xstats_reset(dev);
 	return 0;
 }
 
@@ -255,7 +256,7 @@
  *   Number of xstats names.
  */
 int
-mlx5_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
+mlx5_xstats_get_names(struct rte_eth_dev *dev,
 		      struct rte_eth_xstat_name *xstats_names, unsigned int n)
 {
 	unsigned int i;
@@ -271,5 +272,7 @@
 			xstats_names[i].name[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0;
 		}
 	}
+	mlx5_xstats_n = mlx5_txpp_xstats_get_names(dev, xstats_names,
+						   n, mlx5_xstats_n);
 	return mlx5_xstats_n;
 }
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 202e6b3..cbd0683 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -15,6 +15,17 @@
 #include "mlx5_rxtx.h"
 #include "mlx5_common_os.h"
 
+static const char * const mlx5_txpp_stat_names[] = {
+	"txpp_err_miss_int", /* Missed service interrupt. */
+	"txpp_err_rearm_queue",	/* Rearm Queue errors. */
+	"txpp_err_clock_queue", /* Clock Queue errors. */
+	"txpp_err_ts_past", /* Timestamp in the past. */
+	"txpp_err_ts_future", /* Timestamp in the distant future. */
+	"txpp_jitter", /* Timestamp jitter (one Clock Queue completion). */
+	"txpp_wander", /* Timestamp wander (half of Clock Queue completions). */
+	"txpp_sync_lost", /* Scheduling synchronization lost. */
+};
+
 /* Destroy Event Queue Notification Channel. */
 static void
 mlx5_txpp_destroy_eqn(struct mlx5_dev_ctx_shared *sh)
@@ -1090,3 +1101,211 @@
 	ret = mlx5_read_clock(dev, timestamp);
 	return ret;
 }
+
+/**
+ * DPDK callback to clear device extended statistics.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success and stats is reset, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int mlx5_txpp_xstats_reset(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+
+	rte_atomic32_set(&sh->txpp.err_miss_int, 0);
+	rte_atomic32_set(&sh->txpp.err_rearm_queue, 0);
+	rte_atomic32_set(&sh->txpp.err_clock_queue, 0);
+	rte_atomic32_set(&sh->txpp.err_ts_past, 0);
+	rte_atomic32_set(&sh->txpp.err_ts_future, 0);
+	return 0;
+}
+
+/**
+ * Routine to retrieve names of extended device statistics
+ * for packet send scheduling. It appends the specific stats names
+ * after the parts filled by preceding modules (eth stats, etc.)
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] xstats_names
+ *   Buffer to insert names into.
+ * @param n
+ *   Number of names.
+ * @param n_used
+ *   Number of names filled by preceding statistics modules.
+ *
+ * @return
+ *   Number of xstats names.
+ */
+int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
+			       struct rte_eth_xstat_name *xstats_names,
+			       unsigned int n, unsigned int n_used)
+{
+	unsigned int n_txpp = RTE_DIM(mlx5_txpp_stat_names);
+	unsigned int i;
+
+	if (n >= n_used + n_txpp && xstats_names) {
+		for (i = 0; i < n_txpp; ++i) {
+			strncpy(xstats_names[i + n_used].name,
+				mlx5_txpp_stat_names[i],
+				RTE_ETH_XSTATS_NAME_SIZE);
+			xstats_names[i + n_used].name
+				[RTE_ETH_XSTATS_NAME_SIZE - 1] = 0;
+		}
+	}
+	return n_used + n_txpp;
+}
+
+static inline void
+mlx5_txpp_read_tsa(struct mlx5_dev_txpp *txpp,
+		   struct mlx5_txpp_ts *tsa, uint16_t idx)
+{
+	do {
+		int64_t ts, ci;
+
+		ts = rte_atomic64_read(&txpp->tsa[idx].ts);
+		ci = rte_atomic64_read(&txpp->tsa[idx].ci_ts);
+		rte_compiler_barrier();
+		if ((ci ^ ts) << MLX5_CQ_INDEX_WIDTH != 0)
+			continue;
+		if (rte_atomic64_read(&txpp->tsa[idx].ts) != ts)
+			continue;
+		if (rte_atomic64_read(&txpp->tsa[idx].ci_ts) != ci)
+			continue;
+		rte_atomic64_set(&tsa->ts, ts);
+		rte_atomic64_set(&tsa->ci_ts, ci);
+		return;
+	} while (true);
+}
+
+/*
+ * Jitter reflects the clock change between
+ * neighboring Clock Queue completions.
+ */
+static uint64_t
+mlx5_txpp_xstats_jitter(struct mlx5_dev_txpp *txpp)
+{
+	struct mlx5_txpp_ts tsa0, tsa1;
+	int64_t dts, dci;
+	uint16_t ts_p;
+
+	if (txpp->ts_n < 2) {
+		/* Not enough reports gathered yet. */
+		return 0;
+	}
+	do {
+		int ts_0, ts_1;
+
+		ts_p = txpp->ts_p;
+		rte_compiler_barrier();
+		ts_0 = ts_p - 2;
+		if (ts_0 < 0)
+			ts_0 += MLX5_TXPP_REARM_SQ_SIZE;
+		ts_1 = ts_p - 1;
+		if (ts_1 < 0)
+			ts_1 += MLX5_TXPP_REARM_SQ_SIZE;
+		mlx5_txpp_read_tsa(txpp, &tsa0, ts_0);
+		mlx5_txpp_read_tsa(txpp, &tsa1, ts_1);
+		rte_compiler_barrier();
+	} while (ts_p != txpp->ts_p);
+	/* We have two neighbor reports, calculate the jitter. */
+	dts = rte_atomic64_read(&tsa1.ts) - rte_atomic64_read(&tsa0.ts);
+	dci = (rte_atomic64_read(&tsa1.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH)) -
+	      (rte_atomic64_read(&tsa0.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH));
+	if (dci < 0)
+		dci += 1 << MLX5_CQ_INDEX_WIDTH;
+	dci *= txpp->tick;
+	return (dts > dci) ? dts - dci : dci - dts;
+}
+
+/*
+ * Wander reflects the long-term clock change
+ * over the entire length of all Clock Queue completions.
+ */
+static uint64_t
+mlx5_txpp_xstats_wander(struct mlx5_dev_txpp *txpp)
+{
+	struct mlx5_txpp_ts tsa0, tsa1;
+	int64_t dts, dci;
+	uint16_t ts_p;
+
+	if (txpp->ts_n < MLX5_TXPP_REARM_SQ_SIZE) {
+		/* Not enough reports gathered yet. */
+		return 0;
+	}
+	do {
+		int ts_0, ts_1;
+
+		ts_p = txpp->ts_p;
+		rte_compiler_barrier();
+		ts_0 = ts_p - MLX5_TXPP_REARM_SQ_SIZE / 2 - 1;
+		if (ts_0 < 0)
+			ts_0 += MLX5_TXPP_REARM_SQ_SIZE;
+		ts_1 = ts_p - 1;
+		if (ts_1 < 0)
+			ts_1 += MLX5_TXPP_REARM_SQ_SIZE;
+		mlx5_txpp_read_tsa(txpp, &tsa0, ts_0);
+		mlx5_txpp_read_tsa(txpp, &tsa1, ts_1);
+		rte_compiler_barrier();
+	} while (ts_p != txpp->ts_p);
+	/* We have two distant reports, calculate the wander. */
+	dts = rte_atomic64_read(&tsa1.ts) - rte_atomic64_read(&tsa0.ts);
+	dci = (rte_atomic64_read(&tsa1.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH)) -
+	      (rte_atomic64_read(&tsa0.ci_ts) >> (64 - MLX5_CQ_INDEX_WIDTH));
+	dci += 1 << MLX5_CQ_INDEX_WIDTH;
+	dci *= txpp->tick;
+	return (dts > dci) ? dts - dci : dci - dts;
+}
+
+/**
+ * Routine to retrieve extended device statistics
+ * for packet send scheduling. It appends the specific statistics
+ * after the parts filled by preceding modules (eth stats, etc.)
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] stats
+ *   Pointer to rte extended stats table.
+ * @param n
+ *   The size of the stats table.
+ * @param n_used
+ *   Number of stats filled by preceding statistics modules.
+ *
+ * @return
+ *   Number of extended stats on success and stats is filled,
+ *   negative on error and rte_errno is set.
+ */
+int
+mlx5_txpp_xstats_get(struct rte_eth_dev *dev,
+		     struct rte_eth_xstat *stats,
+		     unsigned int n, unsigned int n_used)
+{
+	unsigned int n_txpp = RTE_DIM(mlx5_txpp_stat_names);
+
+	if (n >= n_used + n_txpp && stats) {
+		struct mlx5_priv *priv = dev->data->dev_private;
+		struct mlx5_dev_ctx_shared *sh = priv->sh;
+		unsigned int i;
+
+		for (i = 0; i < n_txpp; ++i)
+			stats[n_used + i].id = n_used + i;
+		stats[n_used + 0].value =
+				rte_atomic32_read(&sh->txpp.err_miss_int);
+		stats[n_used + 1].value =
+				rte_atomic32_read(&sh->txpp.err_rearm_queue);
+		stats[n_used + 2].value =
+				rte_atomic32_read(&sh->txpp.err_clock_queue);
+		stats[n_used + 3].value =
+				rte_atomic32_read(&sh->txpp.err_ts_past);
+		stats[n_used + 4].value =
+				rte_atomic32_read(&sh->txpp.err_ts_future);
+		stats[n_used + 5].value = mlx5_txpp_xstats_jitter(&sh->txpp);
+		stats[n_used + 6].value = mlx5_txpp_xstats_wander(&sh->txpp);
+		stats[n_used + 7].value = sh->txpp.sync_lost;
+	}
+	return n_used + n_txpp;
+}
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [dpdk-dev] [PATCH v1 16/16] common/mlx5: add register access DevX routine
  2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
                   ` (13 preceding siblings ...)
  2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 15/16] net/mlx5: provide the send scheduling error statistics Viacheslav Ovsiienko
@ 2020-07-10  9:48 ` Viacheslav Ovsiienko
  14 siblings, 0 replies; 16+ messages in thread
From: Viacheslav Ovsiienko @ 2020-07-10  9:48 UTC (permalink / raw)
  To: dev; +Cc: matan, rasland, thomas, ferruh.yigit

A DevX routine to read/write NIC registers via the DevX API is added.
This is a preparation step to check the timestamp modes and units
and to gather extended statistics.
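
A hedged, PMD-internal usage sketch of the new helper (the register
identifier below is a placeholder, not a value from the PRM):

/* Read the first dword of a NIC register through the new helper. */
#include "mlx5_devx_cmds.h"

#define REG_ID_PLACEHOLDER 0x0000 /* hypothetical register id */

static int
query_register_example(void *ctx, uint32_t *value)
{
	/* arg = 0, write = 0: plain read access. */
	return mlx5_devx_cmd_register_access(ctx, REG_ID_PLACEHOLDER,
					     0, value, 0);
}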

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/common/mlx5/mlx5_devx_cmds.c            | 57 +++++++++++++++++++++++++
 drivers/common/mlx5/mlx5_devx_cmds.h            |  4 ++
 drivers/common/mlx5/mlx5_prm.h                  | 25 +++++++++++
 drivers/common/mlx5/rte_common_mlx5_version.map |  1 +
 4 files changed, 87 insertions(+)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 093636c..5b99e11 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -12,6 +12,63 @@
 
 
 /**
+ * Perform access to a NIC register: read data from or write data to
+ * the specified register.
+ *
+ * @param[in] ctx
+ *   Context returned from mlx5 open_device() glue function.
+ * @param[in] reg_id
+ *   Register identifier according to the PRM.
+ * @param[in] arg
+ *   Register access auxiliary parameter according to the PRM.
+ * @param[inout] value
+ *   Pointer to the value to be written to the register, or to the
+ *   buffer where the read data is to be stored.
+ * @param[in] write
+ *   Non-zero value means write to the register should be performed,
+ *   otherwise read access will be performed.
+ *
+ * @return
+ *   0 on success, a negative value otherwise.
+ */
+int
+mlx5_devx_cmd_register_access(void *ctx, uint16_t reg_id,
+			      uint32_t arg, uint32_t *value,
+			      uint32_t write)
+{
+	uint32_t in[MLX5_ST_SZ_DW(access_register_in)]   = {0};
+	uint32_t out[MLX5_ST_SZ_DW(access_register_out)] = {0};
+	int status, rc;
+
+	MLX5_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REGISTER);
+	MLX5_SET(access_register_in, in, op_mod, write ?
+					MLX5_ACCESS_REGISTER_IN_OP_MOD_WRITE :
+					MLX5_ACCESS_REGISTER_IN_OP_MOD_READ);
+	MLX5_SET(access_register_in, in, register_id, reg_id);
+	MLX5_SET(access_register_in, in, argument, arg);
+	if (write && value)
+		MLX5_SET(access_register_in, in, register_data, *value);
+	rc = mlx5_glue->devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out));
+	if (rc)
+		goto error;
+	status = MLX5_GET(access_register_out, out, status);
+	if (status) {
+		int syndrome = MLX5_GET(access_register_out, out, syndrome);
+
+		DRV_LOG(DEBUG, "Failed to access NIC register 0x%X, "
+			       "status %x, syndrome = %x",
+			       reg_id, status, syndrome);
+		return -1;
+	}
+	if (value && !write)
+		*value = MLX5_GET(access_register_out, out, register_data);
+	return 0;
+error:
+	rc = (rc > 0) ? -rc : rc;
+	return rc;
+}
+
+/**
  * Allocate flow counters via devx interface.
  *
  * @param[in] ctx
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index c79b349..119479d 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -383,6 +383,10 @@ int mlx5_devx_cmd_modify_qp_state(struct mlx5_devx_obj *qp,
 int mlx5_devx_cmd_modify_rqt(struct mlx5_devx_obj *rqt,
 			     struct mlx5_devx_rqt_attr *rqt_attr);
 
+__rte_internal
+int mlx5_devx_cmd_register_access(void *ctx, uint16_t reg_id,
+				  uint32_t arg, uint32_t *value,
+				  uint32_t write);
 /**
  * Create virtio queue counters object DevX API.
  *
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 8705b42..6575edc 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -776,6 +776,7 @@ enum {
 	MLX5_CMD_OP_SUSPEND_QP = 0x50F,
 	MLX5_CMD_OP_RESUME_QP = 0x510,
 	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
+	MLX5_CMD_OP_ACCESS_REGISTER = 0x805,
 	MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816,
 	MLX5_CMD_OP_CREATE_TIR = 0x900,
 	MLX5_CMD_OP_CREATE_SQ = 0X904,
@@ -2545,6 +2546,30 @@ struct mlx5_ifc_set_pp_rate_limit_context_bits {
 	u8 reserved_at_60[0x120];
 };
 
+struct mlx5_ifc_access_register_out_bits {
+	u8 status[0x8];
+	u8 reserved_at_8[0x18];
+	u8 syndrome[0x20];
+	u8 reserved_at_40[0x40];
+	u8 register_data[0][0x20];
+};
+
+enum {
+	MLX5_ACCESS_REGISTER_IN_OP_MOD_WRITE  = 0x0,
+	MLX5_ACCESS_REGISTER_IN_OP_MOD_READ   = 0x1,
+};
+
+struct mlx5_ifc_access_register_in_bits {
+	u8 opcode[0x10];
+	u8 reserved_at_10[0x10];
+	u8 reserved_at_20[0x10];
+	u8 op_mod[0x10];
+	u8 reserved_at_40[0x10];
+	u8 register_id[0x10];
+	u8 argument[0x20];
+	u8 register_data[0][0x20];
+};
+
 /* CQE format mask. */
 #define MLX5E_CQE_FORMAT_MASK 0xc
 
diff --git a/drivers/common/mlx5/rte_common_mlx5_version.map b/drivers/common/mlx5/rte_common_mlx5_version.map
index ae57ebd..123b460 100644
--- a/drivers/common/mlx5/rte_common_mlx5_version.map
+++ b/drivers/common/mlx5/rte_common_mlx5_version.map
@@ -34,6 +34,7 @@ INTERNAL {
 	mlx5_devx_cmd_query_hca_attr;
 	mlx5_devx_cmd_query_virtio_q_counters;
 	mlx5_devx_cmd_query_virtq;
+	mlx5_devx_cmd_register_access;
 	mlx5_devx_get_out_command_status;
 
 	mlx5_get_ifname_sysfs;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2020-07-10  9:51 UTC | newest]

Thread overview: 16+ messages
2020-07-10  9:48 [dpdk-dev] [PATCH v1 01/16] common/mlx5: update common part to support packet pacing Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 02/16] net/mlx5: introduce send scheduling devargs Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 03/16] net/mlx5: fix UAR lock sharing for multiport devices Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 04/16] net/mlx5: introduce shared UAR resource Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 05/16] net/mlx5: create clock queue for packet pacing Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 06/16] net/mlx5: create rearm " Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 07/16] net/mlx5: create Tx queues with DevX Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 08/16] net/mlx5: allocate packet pacing context Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 09/16] net/mlx5: introduce clock queue service routine Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 10/16] net/mlx5: prepare Tx queue structures to support timestamp Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 11/16] net/mlx5: convert timestamp to completion index Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 12/16] net/mlx5: prepare Tx datapath to support scheduling Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 13/16] net/mlx5: add scheduling support to send routine template Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 14/16] net/mlx5: add read device clock support Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 15/16] net/mlx5: provide the send scheduling error statistics Viacheslav Ovsiienko
2020-07-10  9:48 ` [dpdk-dev] [PATCH v1 16/16] common/mlx5: add register access DevX routine Viacheslav Ovsiienko
