From: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
To: <dev@dpdk.org>
Cc: <matan@nvidia.com>, <rasland@nvidia.com>, <orika@nvidia.com>,
<stable@dpdk.org>
Subject: [PATCH] net/mlx5: fix read device clock in real time mode
Date: Tue, 3 Jan 2023 13:11:45 +0200 [thread overview]
Message-ID: <20230103111145.29824-1-viacheslavo@nvidia.com> (raw)
Since ConnectX-6DX the real time timestamp mode is supported.
The rte_eth_read_clock() routine queries current timestamp
value from the PMD.
The mlx5 PMD has special infrastructure to schedule packet
sending in real time mode which can be engaged with tx_pp devarg.
This infrastructure provides the timestamp reading from the special
queue CQEs directly from the host memory in user space, without
involving kernel calls.
The ConnectX-7 NIC has hardware capability to schedule packet
sending without special infrastructure and tx_pp devarg can be
omitted. If there is no tx_pp devarg specified, the mlx5 PMD uses kernel
calls to query the current timestamp value. The kernel can be completely
unaware of the engaged real time mode, and it might use its internal
queue CQEs to get timestamps, which is neither precise nor reliable.
Inconsistent values might be returned, causing send scheduling
malfunction.
The HCA PCI BAR provides direct reading of the real time from hardware.
This patch maps PCI resource to the process address space on demand
and allows reading the real time timestamp values from the NIC
directly.
Fixes: b94d93ca73803 ("net/mlx5: support reading device clock")
Cc: stable@dpdk.org
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/mlx5_common.h | 1 +
drivers/common/mlx5/mlx5_prm.h | 5 +-
drivers/common/mlx5/version.map | 1 +
drivers/net/mlx5/linux/mlx5_ethdev_os.c | 68 +++++++++++++++++++++++
drivers/net/mlx5/mlx5.c | 6 +-
drivers/net/mlx5/mlx5.h | 4 ++
drivers/net/mlx5/mlx5_txpp.c | 15 ++++-
drivers/net/mlx5/windows/mlx5_ethdev_os.c | 30 ++++++++++
8 files changed, 127 insertions(+), 3 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_common.h b/drivers/common/mlx5/mlx5_common.h
index d6e91b5296..c7bd703497 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -221,6 +221,7 @@ check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
* - 0 on success.
* - Negative value and rte_errno is set otherwise.
*/
+__rte_internal
int mlx5_dev_to_pci_str(const struct rte_device *dev, char *addr, size_t size);
/*
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 2b5c43ee6e..91ef61a06c 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -3040,6 +3040,7 @@ struct mlx5_ifc_health_buffer_bits {
u8 ext_synd[0x10];
};
+/* HCA PCI BAR resource structure. */
struct mlx5_ifc_initial_seg_bits {
u8 fw_rev_minor[0x10];
u8 fw_rev_major[0x10];
@@ -3067,7 +3068,9 @@ struct mlx5_ifc_initial_seg_bits {
u8 clear_int[0x1];
u8 health_syndrome[0x8];
u8 health_counter[0x18];
- u8 reserved_8[0x17fc0];
+ u8 reserved_8[0x160];
+ u8 real_time[0x40];
+ u8 reserved_9[0x17e20];
};
struct mlx5_ifc_create_cq_out_bits {
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index 4f72900519..03c8ce5593 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -14,6 +14,7 @@ INTERNAL {
mlx5_dev_is_pci;
mlx5_dev_is_vf_pci;
+ mlx5_dev_to_pci_str;
mlx5_dev_mempool_unregister;
mlx5_dev_mempool_subscribe;
diff --git a/drivers/net/mlx5/linux/mlx5_ethdev_os.c b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
index 72268c0c8a..f1ff6f49f9 100644
--- a/drivers/net/mlx5/linux/mlx5_ethdev_os.c
+++ b/drivers/net/mlx5/linux/mlx5_ethdev_os.c
@@ -28,6 +28,7 @@
#include <bus_pci_driver.h>
#include <rte_mbuf.h>
#include <rte_common.h>
+#include <rte_eal_paging.h>
#include <rte_interrupts.h>
#include <rte_malloc.h>
#include <rte_string_fns.h>
@@ -1776,3 +1777,70 @@ int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
mlx5_free(sset_info);
return ret;
}
+
+/**
+ * Unmaps HCA PCI BAR from the current process address space.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
+{
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+
+ if (ppriv && ppriv->hca_bar) {
+ rte_mem_unmap(ppriv->hca_bar, MLX5_ST_SZ_BYTES(initial_seg));
+ ppriv->hca_bar = NULL;
+ }
+}
+
+/**
+ * Maps HCA PCI BAR to the current process address space.
+ * Stores the pointer in the process private structure, allowing
+ * the internal and real time counters to be read directly from the HW.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success and a non-NULL pointer to the mapped area in process structure,
+ * a negative value and a NULL pointer otherwise.
+ */
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
+{
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+ char pci_addr[PCI_PRI_STR_SIZE] = { 0 };
+ void *base, *expected = NULL;
+ int fd, ret;
+
+ if (!ppriv) {
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ if (ppriv->hca_bar)
+ return 0;
+ ret = mlx5_dev_to_pci_str(dev->device, pci_addr, sizeof(pci_addr));
+ if (ret < 0)
+ return -rte_errno;
+ /* Open PCI device resource 0 - HCA initialize segment */
+ MKSTR(name, "/sys/bus/pci/devices/%s/resource0", pci_addr);
+ fd = open(name, O_RDWR | O_SYNC);
+ if (fd == -1) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ base = rte_mem_map(NULL, MLX5_ST_SZ_BYTES(initial_seg),
+ RTE_PROT_READ, RTE_MAP_SHARED, fd, 0);
+ close(fd);
+ if (!base) {
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+ }
+ /* Check there is no concurrent mapping in other thread. */
+ if (!__atomic_compare_exchange_n(&ppriv->hca_bar, &expected,
+ base, false,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+ rte_mem_unmap(base, MLX5_ST_SZ_BYTES(initial_seg));
+ return 0;
+}
+
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 3ae35587b6..b8643cebdd 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1977,8 +1977,12 @@ mlx5_proc_priv_init(struct rte_eth_dev *dev)
void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
- if (!dev->process_private)
+ struct mlx5_proc_priv *ppriv = dev->process_private;
+
+ if (!ppriv)
return;
+ if (ppriv->hca_bar)
+ mlx5_txpp_unmap_hca_bar(dev);
mlx5_free(dev->process_private);
dev->process_private = NULL;
}
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 31982002ee..16b33e1548 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1463,6 +1463,8 @@ struct mlx5_dev_ctx_shared {
* Caution, secondary process may rebuild the struct during port start.
*/
struct mlx5_proc_priv {
+ void *hca_bar;
+ /* Mapped HCA PCI BAR area. */
size_t uar_table_sz;
/* Size of UAR register table. */
struct mlx5_uar_data uar_table[];
@@ -2163,6 +2165,8 @@ int mlx5_txpp_xstats_get_names(struct rte_eth_dev *dev,
struct rte_eth_xstat_name *xstats_names,
unsigned int n, unsigned int n_used);
void mlx5_txpp_interrupt_handler(void *cb_arg);
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev);
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev);
/* mlx5_rxtx.c */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index f853a67f58..63d98dbde9 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -969,6 +969,8 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
{
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_dev_ctx_shared *sh = priv->sh;
+ struct mlx5_proc_priv *ppriv;
+ uint64_t ts;
int ret;
if (sh->txpp.refcnt) {
@@ -979,7 +981,6 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
rte_int128_t u128;
struct mlx5_cqe_ts cts;
} to;
- uint64_t ts;
mlx5_atomic_read_cqe((rte_int128_t *)&cqe->timestamp, &to.u128);
if (to.cts.op_own >> 4) {
@@ -994,6 +995,18 @@ mlx5_txpp_read_clock(struct rte_eth_dev *dev, uint64_t *timestamp)
*timestamp = ts;
return 0;
}
+ /* Check and try to map HCA PCI BAR to allow reading real time. */
+ ppriv = dev->process_private;
+ if (ppriv && !ppriv->hca_bar &&
+ sh->dev_cap.rt_timestamp && mlx5_dev_is_pci(dev->device))
+ mlx5_txpp_map_hca_bar(dev);
+ /* Check if we can read timestamp directly from hardware. */
+ if (ppriv && ppriv->hca_bar) {
+ ts = MLX5_GET64(initial_seg, ppriv->hca_bar, real_time);
+ ts = mlx5_txpp_convert_rx_ts(sh, ts);
+ *timestamp = ts;
+ return 0;
+ }
/* Not supported in isolated mode - kernel does not see the CQEs. */
if (priv->isolated || rte_eal_process_type() != RTE_PROC_PRIMARY)
return -ENOTSUP;
diff --git a/drivers/net/mlx5/windows/mlx5_ethdev_os.c b/drivers/net/mlx5/windows/mlx5_ethdev_os.c
index 88d8213f55..a31e1b5494 100644
--- a/drivers/net/mlx5/windows/mlx5_ethdev_os.c
+++ b/drivers/net/mlx5/windows/mlx5_ethdev_os.c
@@ -416,3 +416,33 @@ int mlx5_get_flag_dropless_rq(struct rte_eth_dev *dev)
RTE_SET_USED(dev);
return -ENOTSUP;
}
+
+/**
+ * Unmaps HCA PCI BAR from the current process address space.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ */
+void mlx5_txpp_unmap_hca_bar(struct rte_eth_dev *dev)
+{
+ RTE_SET_USED(dev);
+}
+
+/**
+ * Maps HCA PCI BAR to the current process address space.
+ * Stores the pointer in the process private structure, allowing
+ * the internal and real time counters to be read directly from the HW.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success and a non-NULL pointer to the mapped area in process structure,
+ * a negative value and a NULL pointer otherwise.
+ */
+int mlx5_txpp_map_hca_bar(struct rte_eth_dev *dev)
+{
+ RTE_SET_USED(dev);
+ rte_errno = ENOTSUP;
+ return -ENOTSUP;
+}
--
2.18.1
next reply other threads:[~2023-01-03 11:12 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-01-03 11:11 Viacheslav Ovsiienko [this message]
2023-01-08 12:59 ` Raslan Darawsheh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230103111145.29824-1-viacheslavo@nvidia.com \
--to=viacheslavo@nvidia.com \
--cc=dev@dpdk.org \
--cc=matan@nvidia.com \
--cc=orika@nvidia.com \
--cc=rasland@nvidia.com \
--cc=stable@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).