DPDK patches and discussions
* [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe
From: Suanming Mou @ 2020-10-06 11:38 UTC
  To: viacheslavo, matan; +Cc: rasland, dev

The mlx5 PMD is going to support multi-threaded flow operations.
This patch set makes the counter action thread safe.
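
The heart of the scheme, after the series is applied, is that free
counters live on per-type lists protected by a spinlock, so that
concurrent allocation and release cannot corrupt the lists. A minimal
sketch of that pattern with the structure names the series converges
on (simplified, not the exact PMD code):

  /* Sketch: spinlock-protected free-list pop for counter allocation. */
  static struct mlx5_flow_counter *
  counter_pop(struct mlx5_flow_counter_mng *cmng, enum mlx5_counter_type type)
  {
          struct mlx5_flow_counter *cnt;

          rte_spinlock_lock(&cmng->csl);
          cnt = TAILQ_FIRST(&cmng->counters[type]);
          if (cnt)
                  TAILQ_REMOVE(&cmng->counters[type], cnt, next);
          rte_spinlock_unlock(&cmng->csl);
          return cnt; /* NULL means a new pool must be prepared. */
  }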

Suanming Mou (6):
  net/mlx5: locate aging pools in the general container
  net/mlx5: optimize shared counter memory
  net/mlx5: remove single counter container
  net/mlx5: synchronize flow counter pool creation
  net/mlx5: make three level table thread safe
  net/mlx5: make shared counters thread safe

 drivers/net/mlx5/linux/mlx5_os.c   |  36 ++-
 drivers/net/mlx5/mlx5.c            |  38 ++-
 drivers/net/mlx5/mlx5.h            |  81 +++---
 drivers/net/mlx5/mlx5_flow.c       | 266 +++++++++++++-----
 drivers/net/mlx5/mlx5_flow.h       |   2 +
 drivers/net/mlx5/mlx5_flow_dv.c    | 555 ++++++++++---------------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  78 +++---
 drivers/net/mlx5/mlx5_utils.c      | 144 +++++++---
 drivers/net/mlx5/mlx5_utils.h      |  52 ++--
 9 files changed, 618 insertions(+), 634 deletions(-)

-- 
1.8.3.1



* [dpdk-dev] [PATCH 1/6] net/mlx5: locate aging pools in the general container
From: Suanming Mou @ 2020-10-06 11:38 UTC
  To: viacheslavo, matan; +Cc: rasland, dev

Commit [1] introduced a different container for the aging counter
pools. In order to save container memory, the aging counter pools
can be located in the general pool container.

This patch locates the aging counter pools in the general pool
container and removes the aging container management.

[1] commit fd143711a6ea ("net/mlx5: separate aging counter pool range")
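
As an illustration (not part of the diff below), once both pool
flavors share the general container, releasing counters only has to
pick the per-type free list; the enum and the IS_AGE_POOL() check
come straight from this patch:

  /* Sketch: route released counters to the per-type free list. */
  enum mlx5_counter_type cnt_type = IS_AGE_POOL(pool) ?
                  MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;

  rte_spinlock_lock(&cont->csl);
  TAILQ_CONCAT(&cont->counters[cnt_type],
               &pool->counters[query_gen], next);
  rte_spinlock_unlock(&cont->csl);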

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/mlx5.c            |  7 ++--
 drivers/net/mlx5/mlx5.h            | 17 +++++----
 drivers/net/mlx5/mlx5_flow.c       | 19 +++-------
 drivers/net/mlx5/mlx5_flow_dv.c    | 78 ++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  4 +-
 5 files changed, 57 insertions(+), 68 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01ead6e..5e3569d 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -458,7 +458,7 @@ struct mlx5_flow_id_pool *
 static void
 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
 {
-	int i;
+	int i, j;
 
 	memset(&sh->cmng, 0, sizeof(sh->cmng));
 	TAILQ_INIT(&sh->cmng.flow_counters);
@@ -468,7 +468,8 @@ struct mlx5_flow_id_pool *
 		sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
 		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
 		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
-		TAILQ_INIT(&sh->cmng.ccont[i].counters);
+		for (j = 0; j < MLX5_COUNTER_TYPE_MAX; j++)
+			TAILQ_INIT(&sh->cmng.ccont[i].counters[j]);
 		rte_spinlock_init(&sh->cmng.ccont[i].csl);
 	}
 }
@@ -513,7 +514,7 @@ struct mlx5_flow_id_pool *
 	}
 	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
 		struct mlx5_flow_counter_pool *pool;
-		uint32_t batch = !!(i > 1);
+		uint32_t batch = (i == MLX5_CCONT_TYPE_BATCH);
 
 		if (!sh->cmng.ccont[i].pools)
 			continue;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bd91e16..27c8f45 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -270,7 +270,6 @@ struct mlx5_drop {
 #define MLX5_COUNTERS_PER_POOL 512
 #define MLX5_MAX_PENDING_QUERIES 4
 #define MLX5_CNT_CONTAINER_RESIZE 64
-#define MLX5_CNT_AGE_OFFSET 0x80000000
 #define CNT_SIZE (sizeof(struct mlx5_flow_counter))
 #define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
 #define AGE_SIZE (sizeof(struct mlx5_age_param))
@@ -279,7 +278,6 @@ struct mlx5_drop {
 #define CNT_POOL_TYPE_AGE	(1 << 1)
 #define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
 #define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
-#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0)
 #define MLX5_CNT_LEN(pool) \
 	(CNT_SIZE + \
 	(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
@@ -322,17 +320,20 @@ enum {
 	AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
 };
 
-#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \
-					    [(batch) * 2 + (age)])
+#define MLX5_CNT_CONTAINER(sh, batch) (&(sh)->cmng.ccont[batch])
 
 enum {
 	MLX5_CCONT_TYPE_SINGLE,
-	MLX5_CCONT_TYPE_SINGLE_FOR_AGE,
 	MLX5_CCONT_TYPE_BATCH,
-	MLX5_CCONT_TYPE_BATCH_FOR_AGE,
 	MLX5_CCONT_TYPE_MAX,
 };
 
+enum mlx5_counter_type {
+	MLX5_COUNTER_TYPE_ORIGIN,
+	MLX5_COUNTER_TYPE_AGE,
+	MLX5_COUNTER_TYPE_MAX,
+};
+
 /* Counter age parameter. */
 struct mlx5_age_param {
 	rte_atomic16_t state; /**< Age state. */
@@ -427,7 +428,8 @@ struct mlx5_pools_container {
 	int max_id; /* The maximum counter ID in the pools. */
 	rte_spinlock_t resize_sl; /* The resize lock. */
 	rte_spinlock_t csl; /* The counter free list lock. */
-	struct mlx5_counters counters; /* Free counter list. */
+	struct mlx5_counters counters[MLX5_COUNTER_TYPE_MAX];
+	/* Free counter list. */
 	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
@@ -441,7 +443,6 @@ struct mlx5_flow_counter_mng {
 	uint8_t pending_queries;
 	uint8_t batch;
 	uint16_t pool_index;
-	uint8_t age;
 	uint8_t query_thread_on;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
 	LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index ffa7646..db7fc8f 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -5940,7 +5940,6 @@ struct mlx5_meter_domains_infos *
 	uint16_t offset;
 	int ret;
 	uint8_t batch = sh->cmng.batch;
-	uint8_t age = sh->cmng.age;
 	uint16_t pool_index = sh->cmng.pool_index;
 	struct mlx5_pools_container *cont;
 	struct mlx5_flow_counter_pool *pool;
@@ -5949,7 +5948,7 @@ struct mlx5_meter_domains_infos *
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
 next_container:
-	cont = MLX5_CNT_CONTAINER(sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(sh, batch);
 	rte_spinlock_lock(&cont->resize_sl);
 	if (!cont->pools) {
 		rte_spinlock_unlock(&cont->resize_sl);
@@ -5958,11 +5957,6 @@ struct mlx5_meter_domains_infos *
 			goto set_alarm;
 		batch ^= 0x1;
 		pool_index = 0;
-		if (batch == 0 && pool_index == 0) {
-			age ^= 0x1;
-			sh->cmng.batch = batch;
-			sh->cmng.age = age;
-		}
 		goto next_container;
 	}
 	pool = cont->pools[pool_index];
@@ -6011,13 +6005,10 @@ struct mlx5_meter_domains_infos *
 	if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
 		batch ^= 0x1;
 		pool_index = 0;
-		if (batch == 0 && pool_index == 0)
-			age ^= 0x1;
 	}
 set_alarm:
 	sh->cmng.batch = batch;
 	sh->cmng.pool_index = pool_index;
-	sh->cmng.age = age;
 	mlx5_set_query_alarm(sh);
 }
 
@@ -6103,10 +6094,12 @@ struct mlx5_meter_domains_infos *
 	struct mlx5_flow_counter_pool *pool =
 		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
 	struct mlx5_counter_stats_raw *raw_to_free;
-	uint8_t age = !!IS_AGE_POOL(pool);
 	uint8_t query_gen = pool->query_gen ^ 1;
 	struct mlx5_pools_container *cont =
-		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool), age);
+		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool));
+	enum mlx5_counter_type cnt_type =
+		IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
+				    MLX5_COUNTER_TYPE_ORIGIN;
 
 	if (unlikely(status)) {
 		raw_to_free = pool->raw_hw;
@@ -6121,7 +6114,7 @@ struct mlx5_meter_domains_infos *
 		rte_io_wmb();
 		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
 			rte_spinlock_lock(&cont->csl);
-			TAILQ_CONCAT(&cont->counters,
+			TAILQ_CONCAT(&cont->counters[cnt_type],
 				     &pool->counters[query_gen], next);
 			rte_spinlock_unlock(&cont->csl);
 		}
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 79fdf34..1bd3899 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4170,16 +4170,14 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_pools_container *cont;
 	struct mlx5_flow_counter_pool *pool;
-	uint32_t batch = 0, age = 0;
+	uint32_t batch = 0;
 
 	idx--;
-	age = MLX_CNT_IS_AGE(idx);
-	idx = age ? idx - MLX5_CNT_AGE_OFFSET : idx;
 	if (idx >= MLX5_CNT_BATCH_OFFSET) {
 		idx -= MLX5_CNT_BATCH_OFFSET;
 		batch = 1;
 	}
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n);
 	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
@@ -4332,19 +4330,15 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[in] batch
  *   Whether the pool is for counter that was allocated by batch command.
- * @param[in] age
- *   Whether the pool is for Aging counter.
  *
  * @return
  *   0 on success, otherwise negative errno value and rte_errno is set.
  */
 static int
-flow_dv_container_resize(struct rte_eth_dev *dev,
-				uint32_t batch, uint32_t age)
+flow_dv_container_resize(struct rte_eth_dev *dev, uint32_t batch)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
 	void *old_pools = cont->pools;
 	uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
@@ -4462,12 +4456,11 @@ struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	int16_t n_valid = rte_atomic16_read(&cont->n_valid);
 	uint32_t size = sizeof(*pool);
 
-	if (cont->n == n_valid && flow_dv_container_resize(dev, batch, age))
+	if (cont->n == n_valid && flow_dv_container_resize(dev, batch))
 		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
 	size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
@@ -4595,10 +4588,12 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_devx_obj *last_min_dcs;
 	struct mlx5_devx_obj *dcs = NULL;
 	struct mlx5_flow_counter *cnt;
+	enum mlx5_counter_type cnt_type =
+			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t add2other;
 	uint32_t i;
 
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	if (!batch) {
 retry:
 		add2other = 0;
@@ -4607,24 +4602,19 @@ struct field_modify_info modify_tcp[] = {
 		if (!dcs)
 			return NULL;
 		pool = flow_dv_find_pool_by_id(cont, dcs->id);
-		/* Check if counter belongs to exist pool ID range. */
-		if (!pool) {
-			pool = flow_dv_find_pool_by_id
-			       (MLX5_CNT_CONTAINER
-			       (priv->sh, batch, (age ^ 0x1)), dcs->id);
-			/*
-			 * Pool eixsts, counter will be added to the other
-			 * container, need to reallocate it later.
-			 */
-			if (pool) {
-				add2other = 1;
-			} else {
-				pool = flow_dv_pool_create(dev, dcs, batch,
-							   age);
-				if (!pool) {
-					mlx5_devx_cmd_destroy(dcs);
-					return NULL;
-				}
+		/*
+		 * If pool exists but with other type, counter will be added
+		 * to the other pool, need to reallocate new counter in the
+		 * range with same type later.
+		 */
+		if (pool && ((!!IS_AGE_POOL(pool)) != age)) {
+			add2other = 1;
+		} else if (!pool) {
+			pool = flow_dv_pool_create(dev, dcs, batch,
+						   age);
+			if (!pool) {
+				mlx5_devx_cmd_destroy(dcs);
+				return NULL;
 			}
 		}
 		if ((dcs->id < pool->min_dcs->id ||
@@ -4692,7 +4682,7 @@ struct field_modify_info modify_tcp[] = {
 		TAILQ_INSERT_HEAD(&tmp_tq, cnt, next);
 	}
 	rte_spinlock_lock(&cont->csl);
-	TAILQ_CONCAT(&cont->counters, &tmp_tq, next);
+	TAILQ_CONCAT(&cont->counters[cnt_type], &tmp_tq, next);
 	rte_spinlock_unlock(&cont->csl);
 	*cnt_free = MLX5_POOL_GET_CNT(pool, 0);
 	(*cnt_free)->pool = pool;
@@ -4765,8 +4755,9 @@ struct field_modify_info modify_tcp[] = {
 	 * shared counters from the single container.
 	 */
 	uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	enum mlx5_counter_type cnt_type =
+			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t cnt_idx;
 
 	if (!priv->config.devx) {
@@ -4789,9 +4780,9 @@ struct field_modify_info modify_tcp[] = {
 	}
 	/* Get free counters from container. */
 	rte_spinlock_lock(&cont->csl);
-	cnt_free = TAILQ_FIRST(&cont->counters);
+	cnt_free = TAILQ_FIRST(&cont->counters[cnt_type]);
 	if (cnt_free)
-		TAILQ_REMOVE(&cont->counters, cnt_free, next);
+		TAILQ_REMOVE(&cont->counters[cnt_type], cnt_free, next);
 	rte_spinlock_unlock(&cont->csl);
 	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free,
 						       batch, age))
@@ -4822,7 +4813,6 @@ struct field_modify_info modify_tcp[] = {
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool->index,
 				MLX5_CNT_ARRAY_IDX(pool, cnt_free));
 	cnt_idx += batch * MLX5_CNT_BATCH_OFFSET;
-	cnt_idx += age * MLX5_CNT_AGE_OFFSET;
 	/* Update the counter reset values. */
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
@@ -4847,7 +4837,7 @@ struct field_modify_info modify_tcp[] = {
 	if (cnt_free) {
 		cnt_free->pool = pool;
 		rte_spinlock_lock(&cont->csl);
-		TAILQ_INSERT_TAIL(&cont->counters, cnt_free, next);
+		TAILQ_INSERT_TAIL(&cont->counters[cnt_type], cnt_free, next);
 		rte_spinlock_unlock(&cont->csl);
 	}
 	return 0;
@@ -4926,6 +4916,7 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
+	enum mlx5_counter_type cnt_type;
 
 	if (!counter)
 		return;
@@ -4954,12 +4945,15 @@ struct field_modify_info modify_tcp[] = {
 	 * function both operate with the different list.
 	 *
 	 */
-	if (!priv->counter_fallback)
+	if (!priv->counter_fallback) {
 		TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen], cnt, next);
-	else
+	} else {
+		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
+					       MLX5_COUNTER_TYPE_ORIGIN;
 		TAILQ_INSERT_TAIL(&((MLX5_CNT_CONTAINER
-				  (priv->sh, 0, 0))->counters),
+				  (priv->sh, 0))->counters[cnt_type]),
 				  cnt, next);
+	}
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 62c18b8..2f3035a 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -159,7 +159,7 @@
 			      struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool;
 
 	idx--;
@@ -254,7 +254,7 @@
 flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
-- 
1.8.3.1



* [dpdk-dev] [PATCH 2/6] net/mlx5: optimize shared counter memory
From: Suanming Mou @ 2020-10-06 11:38 UTC
  To: viacheslavo, matan; +Cc: rasland, dev

Currently, when a counter is allocated, the counter list entry memory
is not used until the counter is released and added back to the
counter container free list. Therefore, if a counter is allocated as
a shared counter, the shared information can be saved into the
counter list entry memory. This adjustment saves memory for the
shared counter.

One more point: sharing is currently only available for single
counters, since a shared counter may be applied to both the root and
non-root tables, while batch counters with offset are not supported
in the root table by some old OFED versions. Batch counters with
offset are fully supported in the current OFED. This commit is also
the initial change to allow batch counters with offset to be shared
in a later change.
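
To make the memory reuse concrete: a counter is either linked on a
free list or allocated (possibly shared), never both at once, so the
list linkage and the shared information can safely overlay each
other. This is the union introduced by this patch (other fields
elided):

  struct mlx5_flow_counter_shared {
          uint32_t ref_cnt; /* Valid only while the counter is shared. */
          uint32_t id;      /* User counter ID. */
  };

  struct mlx5_flow_counter {
          union {
                  /* Used only while the counter is on a free list. */
                  TAILQ_ENTRY(mlx5_flow_counter) next;
                  /* Used only while the counter is allocated as shared. */
                  struct mlx5_flow_counter_shared shared_info;
          };
          /* ... hits/bytes, pool pointer and dv action as in the diff. */
  };

The index returned for a shared counter is additionally tagged with
MLX5_CNT_SHARED_OFFSET (bit 31), so the release path can detect
sharing from the index alone via IS_SHARED_CNT() without any lookup.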

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/mlx5.h            | 33 +++++++++-------
 drivers/net/mlx5/mlx5_flow_dv.c    | 78 +++++++++++++++-----------------------
 drivers/net/mlx5/mlx5_flow_verbs.c | 60 +++++++++++++++++------------
 3 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 27c8f45..fe6bd88 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -270,6 +270,10 @@ struct mlx5_drop {
 #define MLX5_COUNTERS_PER_POOL 512
 #define MLX5_MAX_PENDING_QUERIES 4
 #define MLX5_CNT_CONTAINER_RESIZE 64
+#define MLX5_CNT_SHARED_OFFSET 0x80000000
+#define IS_SHARED_CNT(cnt) (!!((cnt) & MLX5_CNT_SHARED_OFFSET))
+#define IS_BATCH_CNT(cnt) (((cnt) & (MLX5_CNT_SHARED_OFFSET - 1)) >= \
+			   MLX5_CNT_BATCH_OFFSET)
 #define CNT_SIZE (sizeof(struct mlx5_flow_counter))
 #define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
 #define AGE_SIZE (sizeof(struct mlx5_age_param))
@@ -348,11 +352,21 @@ struct flow_counter_stats {
 	uint64_t bytes;
 };
 
+/* Shared counter information. */
+struct mlx5_flow_counter_shared {
+	uint32_t ref_cnt; /**< Reference counter. */
+	uint32_t id; /**< User counter ID. */
+};
+
 struct mlx5_flow_counter_pool;
 /* Generic counters information. */
 struct mlx5_flow_counter {
-	TAILQ_ENTRY(mlx5_flow_counter) next;
-	/**< Pointer to the next flow counter structure. */
+	union {
+		TAILQ_ENTRY(mlx5_flow_counter) next;
+		/**< Pointer to the next flow counter structure. */
+		struct mlx5_flow_counter_shared shared_info;
+		/**< Shared counter information. */
+	};
 	union {
 		uint64_t hits; /**< Reset value of hits packets. */
 		struct mlx5_flow_counter_pool *pool; /**< Counter pool. */
@@ -361,22 +375,15 @@ struct mlx5_flow_counter {
 	void *action; /**< Pointer to the dv action. */
 };
 
-/* Extend counters information for none batch counters. */
+/* Extend counters information for non-batch fallback counters. */
 struct mlx5_flow_counter_ext {
-	uint32_t shared:1; /**< Share counter ID with other flow rules. */
-	uint32_t batch: 1;
 	uint32_t skipped:1; /* This counter is skipped or not. */
-	/**< Whether the counter was allocated by batch command. */
-	uint32_t ref_cnt:29; /**< Reference counter. */
-	uint32_t id; /**< User counter ID. */
-	union {  /**< Holds the counters for the rule. */
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-		struct ibv_counter_set *cs;
+	struct ibv_counter_set *cs;
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-		struct ibv_counters *cs;
+	struct ibv_counters *cs;
 #endif
-		struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
-	};
+	struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
 };
 
 TAILQ_HEAD(mlx5_counters, mlx5_flow_counter);
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 1bd3899..10be990 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4172,8 +4172,9 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_flow_counter_pool *pool;
 	uint32_t batch = 0;
 
-	idx--;
-	if (idx >= MLX5_CNT_BATCH_OFFSET) {
+	/* Decrease to original index and clear shared bit. */
+	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
+	if (IS_BATCH_CNT(idx)) {
 		idx -= MLX5_CNT_BATCH_OFFSET;
 		batch = 1;
 	}
@@ -4408,7 +4409,7 @@ struct field_modify_info modify_tcp[] = {
 
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (counter < MLX5_CNT_BATCH_OFFSET) {
+	if (!IS_BATCH_CNT(counter)) {
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
 		if (priv->counter_fallback)
 			return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
@@ -4696,29 +4697,19 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[in] id
  *   The shared counter ID to search.
- * @param[out] ppool
- *   mlx5 flow counter pool in the container,
  *
  * @return
- *   NULL if not existed, otherwise pointer to the shared extend counter.
+ *   0 if not existed, otherwise shared counter index.
  */
-static struct mlx5_flow_counter_ext *
-flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id,
-			      struct mlx5_flow_counter_pool **ppool)
+static uint32_t
+flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	union mlx5_l3t_data data;
-	uint32_t cnt_idx;
 
-	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) || !data.dword)
-		return NULL;
-	cnt_idx = data.dword;
-	/*
-	 * Shared counters don't have age info. The counter extend is after
-	 * the counter datat structure.
-	 */
-	return (struct mlx5_flow_counter_ext *)
-	       ((flow_dv_counter_get_by_idx(dev, cnt_idx, ppool)) + 1);
+	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data))
+		return 0;
+	return data.dword;
 }
 
 /**
@@ -4765,16 +4756,15 @@ struct field_modify_info modify_tcp[] = {
 		return 0;
 	}
 	if (shared) {
-		cnt_ext = flow_dv_counter_shared_search(dev, id, &pool);
-		if (cnt_ext) {
-			if (cnt_ext->ref_cnt + 1 == 0) {
+		cnt_idx = flow_dv_counter_shared_search(dev, id);
+		if (cnt_idx) {
+			cnt_free = flow_dv_counter_get_by_idx(dev, cnt_idx,
+							      NULL);
+			if (cnt_free->shared_info.ref_cnt + 1 == 0) {
 				rte_errno = E2BIG;
 				return 0;
 			}
-			cnt_ext->ref_cnt++;
-			cnt_idx = pool->index * MLX5_COUNTERS_PER_POOL +
-				  (cnt_ext->dcs->id % MLX5_COUNTERS_PER_POOL)
-				  + 1;
+			cnt_free->shared_info.ref_cnt++;
 			return cnt_idx;
 		}
 	}
@@ -4817,17 +4807,15 @@ struct field_modify_info modify_tcp[] = {
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
 		goto err;
-	if (cnt_ext) {
-		cnt_ext->shared = shared;
-		cnt_ext->ref_cnt = 1;
-		cnt_ext->id = id;
-		if (shared) {
-			union mlx5_l3t_data data;
-
-			data.dword = cnt_idx;
-			if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
-				return 0;
-		}
+	if (shared) {
+		union mlx5_l3t_data data;
+
+		data.dword = cnt_idx;
+		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
+			goto err;
+		cnt_free->shared_info.ref_cnt = 1;
+		cnt_free->shared_info.id = id;
+		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
 	if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on)
 		/* Start the asynchronous batch query by the host thread. */
@@ -4915,22 +4903,18 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt;
-	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	enum mlx5_counter_type cnt_type;
 
 	if (!counter)
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (counter < MLX5_CNT_BATCH_OFFSET) {
-		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-		if (cnt_ext) {
-			if (--cnt_ext->ref_cnt)
-				return;
-			if (cnt_ext->shared)
-				mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-						     cnt_ext->id);
-		}
+
+	if (IS_SHARED_CNT(counter)) {
+		if (--cnt->shared_info.ref_cnt)
+			return;
+		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
+				     cnt->shared_info.id);
 	}
 	if (IS_AGE_POOL(pool))
 		flow_dv_counter_remove_from_age(dev, counter, cnt);
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 2f3035a..0463bea 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -162,7 +162,7 @@
 	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool;
 
-	idx--;
+	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
 	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
@@ -258,22 +258,21 @@
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
+	union mlx5_l3t_data data;
 	uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
-	uint32_t pool_idx;
+	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
 
-	if (shared) {
-		for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
-			pool = cont->pools[pool_idx];
-			for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
-				cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-				if (cnt_ext->shared && cnt_ext->id == id) {
-					cnt_ext->ref_cnt++;
-					return MLX5_MAKE_CNT_IDX(pool_idx, i);
-				}
-			}
+	if (shared && !mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) &&
+	    data.dword) {
+		cnt = flow_verbs_counter_get_by_idx(dev, data.dword, NULL);
+		if (cnt->shared_info.ref_cnt + 1 == 0) {
+			rte_errno = E2BIG;
+			return 0;
 		}
+		cnt->shared_info.ref_cnt++;
+		return data.dword;
 	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
 		pool = cont->pools[pool_idx];
@@ -322,17 +321,23 @@
 		TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
+	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
+	if (shared) {
+		data.dword = cnt_idx;
+		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
+			return 0;
+		cnt->shared_info.ref_cnt = 1;
+		cnt->shared_info.id = id;
+		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
+	}
 	cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-	cnt_ext->id = id;
-	cnt_ext->shared = shared;
-	cnt_ext->ref_cnt = 1;
 	cnt->hits = 0;
 	cnt->bytes = 0;
 	/* Create counter with Verbs. */
 	ret = flow_verbs_counter_create(dev, cnt_ext);
 	if (!ret) {
 		TAILQ_REMOVE(&pool->counters[0], cnt, next);
-		return MLX5_MAKE_CNT_IDX(pool_idx, i);
+		return cnt_idx;
 	}
 	/* Some error occurred in Verbs library. */
 	rte_errno = -ret;
@@ -350,23 +355,28 @@
 static void
 flow_verbs_counter_release(struct rte_eth_dev *dev, uint32_t counter)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter *cnt;
 	struct mlx5_flow_counter_ext *cnt_ext;
 
-	cnt = flow_verbs_counter_get_by_idx(dev, counter,
-					    &pool);
+	cnt = flow_verbs_counter_get_by_idx(dev, counter, &pool);
+	if (IS_SHARED_CNT(counter)) {
+		if (--cnt->shared_info.ref_cnt)
+			return;
+		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
+				     cnt->shared_info.id);
+	}
 	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-	if (--cnt_ext->ref_cnt == 0) {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-		claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
-		cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
+	cnt_ext->cs = NULL;
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-		claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
-		cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
+	cnt_ext->cs = NULL;
 #endif
-		TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
-	}
+	(void)cnt_ext;
+	TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 }
 
 /**
-- 
1.8.3.1



* [dpdk-dev] [PATCH 3/6] net/mlx5: remove single counter container
From: Suanming Mou @ 2020-10-06 11:38 UTC
  To: viacheslavo, matan; +Cc: rasland, dev

A flow counter which was allocated by a batch API could not be
assigned to a flow in the root table (group 0) in old rdma-core
versions. Hence, a root table flow counter required a PMD mechanism
to manage counters which were allocated singly.

Currently, batch counters are supported in the root table by
rdma-core versions that include the MLX5_FLOW_ACTION_COUNTER_OFFSET
enum and by kernel drivers that include the
MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET enum.

When the PMD uses the rdma-core API to assign a batch counter with an
invalid counter offset to a root table flow, it gets an error only if
batch counter assignment for the root table is supported. Performing
this trial at initialization time helps to detect that support.

Using the above trial, if the support is valid, remove the single
counter container management from the fast counter mechanism.
Otherwise, move the counter mechanism to fallback mode.
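
Condensed, the trial creates a probe flow in group 0 whose counter
action deliberately carries an invalid offset and inspects the
result; error handling and resource cleanup are trimmed here, see
mlx5_flow_discover_counter_offset_support() in the diff below for the
full version:

  /*
   * Sketch: if the probe flow is accepted, the invalid offset was not
   * validated, i.e. batch counters are unsupported in the root table
   * and the PMD must fall back to single counter allocation.
   */
  flow_idx = flow_list_create(dev, NULL, &attr, items, actions,
                              true, &error);
  if (flow_idx) {
          flow_list_destroy(dev, NULL, flow_idx);
          ret = -ENOTSUP; /* Caller switches to fallback mode. */
  } else {
          ret = 0; /* The offset was validated: batch counters work. */
  }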

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c   |  36 +++-
 drivers/net/mlx5/mlx5.c            |  39 ++---
 drivers/net/mlx5/mlx5.h            |  27 +--
 drivers/net/mlx5/mlx5_flow.c       | 152 ++++++++++-------
 drivers/net/mlx5/mlx5_flow.h       |   2 +
 drivers/net/mlx5/mlx5_flow_dv.c    | 337 +++++++++++++------------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  26 +--
 7 files changed, 273 insertions(+), 346 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 188a6d4..43d173b 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -512,6 +512,32 @@
 }
 
 /**
+ * DV flow counter mode detect and config.
+ *
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_counter_mode_config(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	/* If devx is not supported, counters are not working. */
+	if (!priv->config.devx)
+		return;
+	priv->counter_fallback = 0;
+	if (!priv->config.hca_attr.flow_counters_dump ||
+	    (mlx5_flow_discover_counter_offset_support(dev) == -ENOTSUP))
+		priv->counter_fallback = 1;
+#ifndef HAVE_IBV_DEVX_ASYNC
+	priv->counter_fallback = 1;
+#endif
+	if (priv->counter_fallback)
+		DRV_LOG(INFO, "Use fall-back DV counter management");
+}
+
+/**
  * Spawn an Ethernet device from Verbs information.
  *
  * @param dpdk_dev
@@ -979,19 +1005,11 @@
 		DRV_LOG(INFO, "Rx CQE padding is enabled");
 	}
 	if (config->devx) {
-		priv->counter_fallback = 0;
 		err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
 		if (err) {
 			err = -err;
 			goto error;
 		}
-		if (!config->hca_attr.flow_counters_dump)
-			priv->counter_fallback = 1;
-#ifndef HAVE_IBV_DEVX_ASYNC
-		priv->counter_fallback = 1;
-#endif
-		if (priv->counter_fallback)
-			DRV_LOG(INFO, "Use fall-back DV counter management");
 		/* Check for LRO support. */
 		if (config->dest_tir && config->hca_attr.lro_cap &&
 		    config->dv_flow_en) {
@@ -1364,6 +1382,8 @@
 			goto error;
 		}
 	}
+	if (priv->config.dv_flow_en)
+		mlx5_flow_counter_mode_config(eth_dev);
 	return eth_dev;
 error:
 	if (priv) {
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 5e3569d..96cebba 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -458,20 +458,18 @@ struct mlx5_flow_id_pool *
 static void
 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
 {
-	int i, j;
+	int i;
 
 	memset(&sh->cmng, 0, sizeof(sh->cmng));
 	TAILQ_INIT(&sh->cmng.flow_counters);
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
-		sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
-		sh->cmng.ccont[i].max_id = -1;
-		sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
-		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
-		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
-		for (j = 0; j < MLX5_COUNTER_TYPE_MAX; j++)
-			TAILQ_INIT(&sh->cmng.ccont[i].counters[j]);
-		rte_spinlock_init(&sh->cmng.ccont[i].csl);
-	}
+	sh->cmng.min_id = MLX5_CNT_BATCH_OFFSET;
+	sh->cmng.max_id = -1;
+	sh->cmng.last_pool_idx = POOL_IDX_INVALID;
+	TAILQ_INIT(&sh->cmng.pool_list);
+	rte_spinlock_init(&sh->cmng.resize_sl);
+	for (i = 0; i < MLX5_COUNTER_TYPE_MAX; i++)
+		TAILQ_INIT(&sh->cmng.counters[i]);
+	rte_spinlock_init(&sh->cmng.csl);
 }
 
 /**
@@ -501,7 +499,6 @@ struct mlx5_flow_id_pool *
 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
 {
 	struct mlx5_counter_stats_mem_mng *mng;
-	int i;
 	int j;
 	int retries = 1024;
 
@@ -512,15 +509,13 @@ struct mlx5_flow_id_pool *
 			break;
 		rte_pause();
 	}
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
+
+	if (sh->cmng.pools) {
 		struct mlx5_flow_counter_pool *pool;
-		uint32_t batch = (i == MLX5_CCONT_TYPE_BATCH);
 
-		if (!sh->cmng.ccont[i].pools)
-			continue;
-		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+		pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		while (pool) {
-			if (batch && pool->min_dcs)
+			if (!IS_EXT_POOL(pool) && pool->min_dcs)
 				claim_zero(mlx5_devx_cmd_destroy
 							       (pool->min_dcs));
 			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
@@ -529,17 +524,17 @@ struct mlx5_flow_id_pool *
 					 (mlx5_glue->destroy_flow_action
 					  (MLX5_POOL_GET_CNT
 					  (pool, j)->action));
-				if (!batch && MLX5_GET_POOL_CNT_EXT
+				if (IS_EXT_POOL(pool) && MLX5_GET_POOL_CNT_EXT
 				    (pool, j)->dcs)
 					claim_zero(mlx5_devx_cmd_destroy
 						   (MLX5_GET_POOL_CNT_EXT
 						    (pool, j)->dcs));
 			}
-			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
+			TAILQ_REMOVE(&sh->cmng.pool_list, pool, next);
 			mlx5_free(pool);
-			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+			pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		}
-		mlx5_free(sh->cmng.ccont[i].pools);
+		mlx5_free(sh->cmng.pools);
 	}
 	mng = LIST_FIRST(&sh->cmng.mem_mngs);
 	while (mng) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index fe6bd88..a3d4ad9 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -280,8 +280,10 @@ struct mlx5_drop {
 #define MLX5_AGING_TIME_DELAY	7
 #define CNT_POOL_TYPE_EXT	(1 << 0)
 #define CNT_POOL_TYPE_AGE	(1 << 1)
+
 #define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
 #define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
+
 #define MLX5_CNT_LEN(pool) \
 	(CNT_SIZE + \
 	(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
@@ -324,14 +326,6 @@ enum {
 	AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
 };
 
-#define MLX5_CNT_CONTAINER(sh, batch) (&(sh)->cmng.ccont[batch])
-
-enum {
-	MLX5_CCONT_TYPE_SINGLE,
-	MLX5_CCONT_TYPE_BATCH,
-	MLX5_CCONT_TYPE_MAX,
-};
-
 enum mlx5_counter_type {
 	MLX5_COUNTER_TYPE_ORIGIN,
 	MLX5_COUNTER_TYPE_AGE,
@@ -377,7 +371,6 @@ struct mlx5_flow_counter {
 
 /* Extend counters information for non-batch fallback counters. */
 struct mlx5_flow_counter_ext {
-	uint32_t skipped:1; /* This counter is skipped or not. */
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 	struct ibv_counter_set *cs;
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
@@ -397,9 +390,8 @@ struct mlx5_flow_counter_pool {
 		rte_atomic64_t a64_dcs;
 	};
 	/* The devx object of the minimum counter ID. */
-	uint32_t index:28; /* Pool index in container. */
+	uint32_t index:29; /* Pool index in container. */
 	uint32_t type:2; /* Memory type behind the counter array. */
-	uint32_t skip_cnt:1; /* Pool contains skipped counter. */
 	volatile uint32_t query_gen:1; /* Query round. */
 	rte_spinlock_t sl; /* The pool lock. */
 	struct mlx5_counter_stats_raw *raw;
@@ -419,15 +411,14 @@ struct mlx5_counter_stats_mem_mng {
 /* Raw memory structure for the counter statistics values of a pool. */
 struct mlx5_counter_stats_raw {
 	LIST_ENTRY(mlx5_counter_stats_raw) next;
-	int min_dcs_id;
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	volatile struct flow_counter_stats *data;
 };
 
 TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool);
 
-/* Container structure for counter pools. */
-struct mlx5_pools_container {
+/* Counter global management structure. */
+struct mlx5_flow_counter_mng {
 	rte_atomic16_t n_valid; /* Number of valid pools. */
 	uint16_t n; /* Number of pools. */
 	uint16_t last_pool_idx; /* Last used pool index */
@@ -441,14 +432,8 @@ struct mlx5_pools_container {
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	/* Hold the memory management for the next allocated pools raws. */
-};
-
-/* Counter global management structure. */
-struct mlx5_flow_counter_mng {
-	struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX];
 	struct mlx5_counters flow_counters; /* Legacy flow counter list. */
 	uint8_t pending_queries;
-	uint8_t batch;
 	uint16_t pool_index;
 	uint8_t query_thread_on;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
@@ -838,6 +823,8 @@ struct mlx5_priv {
 	struct mlx5_flow_meters flow_meters; /* MTR list. */
 	uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */
 	uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */
+	void *cnt_action; /* Counter action to validate invalid offset. */
+	struct mlx5_devx_obj *cnt_dcs; /* Counter validate devx object. */
 	struct mlx5_mp_id mp_id; /* ID of a multi-process process */
 	LIST_HEAD(fdir, mlx5_fdir_flow) fdir_flows; /* fdir flows. */
 };
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index db7fc8f..c280f56 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -5883,26 +5883,6 @@ struct mlx5_meter_domains_infos *
 #define MLX5_POOL_QUERY_FREQ_US 1000000
 
 /**
- * Get number of all validate pools.
- *
- * @param[in] sh
- *   Pointer to mlx5_dev_ctx_shared object.
- *
- * @return
- *   The number of all validate pools.
- */
-static uint32_t
-mlx5_get_all_valid_pool_count(struct mlx5_dev_ctx_shared *sh)
-{
-	int i;
-	uint32_t pools_n = 0;
-
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i)
-		pools_n += rte_atomic16_read(&sh->cmng.ccont[i].n_valid);
-	return pools_n;
-}
-
-/**
  * Set the periodic procedure for triggering asynchronous batch queries for all
  * the counter pools.
  *
@@ -5914,7 +5894,7 @@ struct mlx5_meter_domains_infos *
 {
 	uint32_t pools_n, us;
 
-	pools_n = mlx5_get_all_valid_pool_count(sh);
+	pools_n = rte_atomic16_read(&sh->cmng.n_valid);
 	us = MLX5_POOL_QUERY_FREQ_US / pools_n;
 	DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us);
 	if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
@@ -5936,31 +5916,20 @@ struct mlx5_meter_domains_infos *
 mlx5_flow_query_alarm(void *arg)
 {
 	struct mlx5_dev_ctx_shared *sh = arg;
-	struct mlx5_devx_obj *dcs;
-	uint16_t offset;
 	int ret;
-	uint8_t batch = sh->cmng.batch;
 	uint16_t pool_index = sh->cmng.pool_index;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
-	int cont_loop = MLX5_CCONT_TYPE_MAX;
 
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
-next_container:
-	cont = MLX5_CNT_CONTAINER(sh, batch);
-	rte_spinlock_lock(&cont->resize_sl);
-	if (!cont->pools) {
-		rte_spinlock_unlock(&cont->resize_sl);
-		/* Check if all the containers are empty. */
-		if (unlikely(--cont_loop == 0))
-			goto set_alarm;
-		batch ^= 0x1;
-		pool_index = 0;
-		goto next_container;
+	rte_spinlock_lock(&cmng->resize_sl);
+	if (!cmng->pools) {
+		rte_spinlock_unlock(&cmng->resize_sl);
+		goto set_alarm;
 	}
-	pool = cont->pools[pool_index];
-	rte_spinlock_unlock(&cont->resize_sl);
+	pool = cmng->pools[pool_index];
+	rte_spinlock_unlock(&cmng->resize_sl);
 	if (pool->raw_hw)
 		/* There is a pool query in progress. */
 		goto set_alarm;
@@ -5969,14 +5938,6 @@ struct mlx5_meter_domains_infos *
 	if (!pool->raw_hw)
 		/* No free counter statistics raw memory. */
 		goto set_alarm;
-	dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read
-							      (&pool->a64_dcs);
-	if (dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1)) {
-		/* Pool without valid counter. */
-		pool->raw_hw = NULL;
-		goto next_pool;
-	}
-	offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL;
 	/*
 	 * Identify the counters released between query trigger and query
 	 * handle more effiecntly. The counter released in this gap period
@@ -5984,11 +5945,12 @@ struct mlx5_meter_domains_infos *
 	 * will not be taken into account.
 	 */
 	pool->query_gen++;
-	ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL -
-					       offset, NULL, NULL,
+	ret = mlx5_devx_cmd_flow_counter_query(pool->min_dcs, 0,
+					       MLX5_COUNTERS_PER_POOL,
+					       NULL, NULL,
 					       pool->raw_hw->mem_mng->dm->id,
 					       (void *)(uintptr_t)
-					       (pool->raw_hw->data + offset),
+					       pool->raw_hw->data,
 					       sh->devx_comp,
 					       (uint64_t)(uintptr_t)pool);
 	if (ret) {
@@ -5997,17 +5959,12 @@ struct mlx5_meter_domains_infos *
 		pool->raw_hw = NULL;
 		goto set_alarm;
 	}
-	pool->raw_hw->min_dcs_id = dcs->id;
 	LIST_REMOVE(pool->raw_hw, next);
 	sh->cmng.pending_queries++;
-next_pool:
 	pool_index++;
-	if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
-		batch ^= 0x1;
+	if (pool_index >= rte_atomic16_read(&cmng->n_valid))
 		pool_index = 0;
-	}
 set_alarm:
-	sh->cmng.batch = batch;
 	sh->cmng.pool_index = pool_index;
 	mlx5_set_query_alarm(sh);
 }
@@ -6095,8 +6052,7 @@ struct mlx5_meter_domains_infos *
 		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
 	struct mlx5_counter_stats_raw *raw_to_free;
 	uint8_t query_gen = pool->query_gen ^ 1;
-	struct mlx5_pools_container *cont =
-		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool));
+	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	enum mlx5_counter_type cnt_type =
 		IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
 				    MLX5_COUNTER_TYPE_ORIGIN;
@@ -6113,10 +6069,10 @@ struct mlx5_meter_domains_infos *
 		/* Be sure the new raw counters data is updated in memory. */
 		rte_io_wmb();
 		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
-			rte_spinlock_lock(&cont->csl);
-			TAILQ_CONCAT(&cont->counters[cnt_type],
+			rte_spinlock_lock(&cmng->csl);
+			TAILQ_CONCAT(&cmng->counters[cnt_type],
 				     &pool->counters[query_gen], next);
-			rte_spinlock_unlock(&cont->csl);
+			rte_spinlock_unlock(&cmng->csl);
 		}
 	}
 	LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next);
@@ -6238,6 +6194,80 @@ struct mlx5_meter_domains_infos *
 }
 
 /**
+ * Validate if batch counter supported in root table.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_discover_counter_offset_support(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_attr attr = {
+		.group = 0,
+		.ingress = 1,
+	};
+	struct rte_flow_item items[] = {
+		[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+	};
+	struct rte_flow_action actions[] = {
+		[0] = {
+			.type = (enum rte_flow_action_type)
+				MLX5_RTE_FLOW_ACTION_TYPE_COUNT,
+		},
+		[1] = {
+			.type = RTE_FLOW_ACTION_TYPE_JUMP,
+			.conf = &(struct rte_flow_action_jump){
+				.group = 1,
+			},
+		},
+		[2] = {
+			.type = RTE_FLOW_ACTION_TYPE_END,
+		},
+	};
+	int ret = 0;
+	uint32_t flow_idx;
+	struct rte_flow_error error;
+
+	flow_idx = flow_list_create(dev, NULL, &attr, items,
+				    actions, true, &error);
+	/*
+	 * If batch counter with offset is not supported, the driver will not
+	 * validate the invalid offset value, so the flow create will succeed.
+	 * In this case, it means batch counter is not supported in root table.
+	 *
+	 * Otherwise, if flow create failed for other reasons, report an error
+	 * message.
+	 */
+	if (flow_idx) {
+		flow_list_destroy(dev, NULL, flow_idx);
+		DRV_LOG(WARNING, "Batch counter is not supported in root "
+				 "table. Switch to fallback mode.");
+		rte_errno = ENOTSUP;
+		ret = -rte_errno;
+	} else {
+		if (errno != EINVAL)
+			DRV_LOG(ERR, "Counter may not work correctly as "
+				     "validation failed with unknown reason.");
+		ret = 0;
+	}
+	if (priv->cnt_action) {
+		mlx5_flow_os_destroy_flow_action(priv->cnt_action);
+		priv->cnt_action = NULL;
+	}
+	if (priv->cnt_dcs) {
+		mlx5_devx_cmd_destroy(priv->cnt_dcs);
+		priv->cnt_dcs = NULL;
+	}
+	return ret;
+}
+
+/**
  * Dump flow raw hw data to file
  *
  * @param[in] dev
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 279daf2..344634f 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -35,6 +35,7 @@ enum mlx5_rte_flow_action_type {
 	MLX5_RTE_FLOW_ACTION_TYPE_MARK,
 	MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
 	MLX5_RTE_FLOW_ACTION_TYPE_DEFAULT_MISS,
+	MLX5_RTE_FLOW_ACTION_TYPE_COUNT,
 };
 
 /* Matches on selected register. */
@@ -1069,4 +1070,5 @@ int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
 				    const struct rte_flow_attr *attr);
 int mlx5_flow_meter_flush(struct rte_eth_dev *dev,
 			  struct rte_mtr_error *error);
+int mlx5_flow_discover_counter_offset_support(struct rte_eth_dev *dev);
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 10be990..43d8ea8 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4168,19 +4168,13 @@ struct field_modify_info modify_tcp[] = {
 			   struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
-	uint32_t batch = 0;
 
 	/* Decrease to original index and clear shared bit. */
 	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
-	if (IS_BATCH_CNT(idx)) {
-		idx -= MLX5_CNT_BATCH_OFFSET;
-		batch = 1;
-	}
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n);
-	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cmng->n);
+	pool = cmng->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
 		*ppool = pool;
@@ -4212,8 +4206,8 @@ struct field_modify_info modify_tcp[] = {
 /**
  * Get a pool by devx counter ID.
  *
- * @param[in] cont
- *   Pointer to the counter container.
+ * @param[in] cmng
+ *   Pointer to the counter management.
  * @param[in] id
  *   The counter devx ID.
  *
@@ -4221,25 +4215,25 @@ struct field_modify_info modify_tcp[] = {
  *   The counter pool pointer if exists, NULL otherwise,
  */
 static struct mlx5_flow_counter_pool *
-flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id)
+flow_dv_find_pool_by_id(struct mlx5_flow_counter_mng *cmng, int id)
 {
 	uint32_t i;
 
 	/* Check last used pool. */
-	if (cont->last_pool_idx != POOL_IDX_INVALID &&
-	    flow_dv_is_counter_in_pool(cont->pools[cont->last_pool_idx], id))
-		return cont->pools[cont->last_pool_idx];
+	if (cmng->last_pool_idx != POOL_IDX_INVALID &&
+	    flow_dv_is_counter_in_pool(cmng->pools[cmng->last_pool_idx], id))
+		return cmng->pools[cmng->last_pool_idx];
 	/* ID out of range means no suitable pool in the container. */
-	if (id > cont->max_id || id < cont->min_id)
+	if (id > cmng->max_id || id < cmng->min_id)
 		return NULL;
 	/*
 	 * Find the pool from the end of the container, since mostly counter
 	 * ID is sequence increasing, and the last pool should be the needed
 	 * one.
 	 */
-	i = rte_atomic16_read(&cont->n_valid);
+	i = rte_atomic16_read(&cmng->n_valid);
 	while (i--) {
-		struct mlx5_flow_counter_pool *pool = cont->pools[i];
+		struct mlx5_flow_counter_pool *pool = cmng->pools[i];
 
 		if (flow_dv_is_counter_in_pool(pool, id))
 			return pool;
@@ -4329,20 +4323,18 @@ struct field_modify_info modify_tcp[] = {
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  *
  * @return
  *   0 on success, otherwise negative errno value and rte_errno is set.
  */
 static int
-flow_dv_container_resize(struct rte_eth_dev *dev, uint32_t batch)
+flow_dv_container_resize(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
-	void *old_pools = cont->pools;
-	uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
+	void *old_pools = cmng->pools;
+	uint32_t resize = cmng->n + MLX5_CNT_CONTAINER_RESIZE;
 	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
 	void *pools = mlx5_malloc(MLX5_MEM_ZERO, mem_size, 0, SOCKET_ID_ANY);
 
@@ -4351,7 +4343,7 @@ struct field_modify_info modify_tcp[] = {
 		return -ENOMEM;
 	}
 	if (old_pools)
-		memcpy(pools, old_pools, cont->n *
+		memcpy(pools, old_pools, cmng->n *
 				       sizeof(struct mlx5_flow_counter_pool *));
 	/*
 	 * Fallback mode query the counter directly, no background query
@@ -4372,11 +4364,11 @@ struct field_modify_info modify_tcp[] = {
 					 MLX5_CNT_CONTAINER_RESIZE +
 					 i, next);
 	}
-	rte_spinlock_lock(&cont->resize_sl);
-	cont->n = resize;
-	cont->mem_mng = mem_mng;
-	cont->pools = pools;
-	rte_spinlock_unlock(&cont->resize_sl);
+	rte_spinlock_lock(&cmng->resize_sl);
+	cmng->n = resize;
+	cmng->mem_mng = mem_mng;
+	cmng->pools = pools;
+	rte_spinlock_unlock(&cmng->resize_sl);
 	if (old_pools)
 		mlx5_free(old_pools);
 	return 0;
@@ -4409,27 +4401,16 @@ struct field_modify_info modify_tcp[] = {
 
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (!IS_BATCH_CNT(counter)) {
+	if (priv->counter_fallback) {
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-		if (priv->counter_fallback)
-			return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
+		return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
 					0, pkts, bytes, 0, NULL, NULL, 0);
 	}
 
 	rte_spinlock_lock(&pool->sl);
-	/*
-	 * The single counters allocation may allocate smaller ID than the
-	 * current allocated in parallel to the host reading.
-	 * In this case the new counter values must be reported as 0.
-	 */
-	if (unlikely(cnt_ext && cnt_ext->dcs->id < pool->raw->min_dcs_id)) {
-		*pkts = 0;
-		*bytes = 0;
-	} else {
-		offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
-		*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
-		*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
-	}
+	offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
+	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
 	rte_spinlock_unlock(&pool->sl);
 	return 0;
 }
@@ -4441,8 +4422,6 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[out] dcs
  *   The devX counter handle.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  * @param[in] age
  *   Whether the pool is for counter that was allocated for aging.
  * @param[in/out] cont_cur
@@ -4453,123 +4432,63 @@ struct field_modify_info modify_tcp[] = {
  */
 static struct mlx5_flow_counter_pool *
 flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs,
-		    uint32_t batch, uint32_t age)
+		    uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	int16_t n_valid = rte_atomic16_read(&cont->n_valid);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
+	int16_t n_valid = rte_atomic16_read(&cmng->n_valid);
+	uint32_t fallback = priv->counter_fallback;
 	uint32_t size = sizeof(*pool);
 
-	if (cont->n == n_valid && flow_dv_container_resize(dev, batch))
+	if (cmng->n == n_valid && flow_dv_container_resize(dev))
 		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
-	size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
+	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
 	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
 	pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 	if (!pool) {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
-	pool->min_dcs = dcs;
-	if (!priv->counter_fallback)
-		pool->raw = cont->mem_mng->raws + n_valid %
+	if (!fallback) {
+		pool->min_dcs = dcs;
+		pool->raw = cmng->mem_mng->raws + n_valid %
 						      MLX5_CNT_CONTAINER_RESIZE;
+	}
 	pool->raw_hw = NULL;
 	pool->type = 0;
-	pool->type |= (batch ? 0 :  CNT_POOL_TYPE_EXT);
+	pool->type |= (!fallback ? 0 :  CNT_POOL_TYPE_EXT);
 	pool->type |= (!age ? 0 :  CNT_POOL_TYPE_AGE);
 	pool->query_gen = 0;
 	rte_spinlock_init(&pool->sl);
 	TAILQ_INIT(&pool->counters[0]);
 	TAILQ_INIT(&pool->counters[1]);
-	TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+	TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
 	pool->index = n_valid;
-	cont->pools[n_valid] = pool;
-	if (!batch) {
+	cmng->pools[n_valid] = pool;
+	if (fallback) {
 		int base = RTE_ALIGN_FLOOR(dcs->id, MLX5_COUNTERS_PER_POOL);
 
-		if (base < cont->min_id)
-			cont->min_id = base;
-		if (base > cont->max_id)
-			cont->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
-		cont->last_pool_idx = pool->index;
+		if (base < cmng->min_id)
+			cmng->min_id = base;
+		if (base > cmng->max_id)
+			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
+		cmng->last_pool_idx = pool->index;
 	}
 	/* Pool initialization must be updated before host thread access. */
 	rte_io_wmb();
-	rte_atomic16_add(&cont->n_valid, 1);
+	rte_atomic16_add(&cmng->n_valid, 1);
 	return pool;
 }
 
 /**
- * Restore skipped counters in the pool.
- *
- * As counter pool query requires the first counter dcs
- * ID start with 4 alinged, if the pool counters with
- * min_dcs ID are not aligned with 4, the counters will
- * be skipped.
- * Once other min_dcs ID less than these skipped counter
- * dcs ID appears, the skipped counters will be safe to
- * use.
- * Should be called when min_dcs is updated.
- *
- * @param[in] pool
- *   Current counter pool.
- * @param[in] last_min_dcs
- *   Last min_dcs.
- */
-static void
-flow_dv_counter_restore(struct mlx5_flow_counter_pool *pool,
-			struct mlx5_devx_obj *last_min_dcs)
-{
-	struct mlx5_flow_counter_ext *cnt_ext;
-	uint32_t offset, new_offset;
-	uint32_t skip_cnt = 0;
-	uint32_t i;
-
-	if (!pool->skip_cnt)
-		return;
-	/*
-	 * If last min_dcs is not valid. The skipped counter may even after
-	 * last min_dcs, set the offset to the whole pool.
-	 */
-	if (last_min_dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))
-		offset = MLX5_COUNTERS_PER_POOL;
-	else
-		offset = last_min_dcs->id % MLX5_COUNTERS_PER_POOL;
-	new_offset = pool->min_dcs->id % MLX5_COUNTERS_PER_POOL;
-	/*
-	 * Check the counters from 1 to the last_min_dcs range. Counters
-	 * before new min_dcs indicates pool still has skipped counters.
-	 * Counters be skipped after new min_dcs will be ready to use.
-	 * Offset 0 counter must be empty or min_dcs, start from 1.
-	 */
-	for (i = 1; i < offset; i++) {
-		cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-		if (cnt_ext->skipped) {
-			if (i > new_offset) {
-				cnt_ext->skipped = 0;
-				TAILQ_INSERT_TAIL
-					(&pool->counters[pool->query_gen],
-					 MLX5_POOL_GET_CNT(pool, i), next);
-			} else {
-				skip_cnt++;
-			}
-		}
-	}
-	if (!skip_cnt)
-		pool->skip_cnt = 0;
-}
-
-/**
  * Prepare a new counter and/or a new counter pool.
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
  * @param[out] cnt_free
  *   Where to put the pointer of a new counter.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  * @param[in] age
  *   Whether the pool is for counter that was allocated for aging.
  *
@@ -4580,87 +4499,36 @@ struct field_modify_info modify_tcp[] = {
 static struct mlx5_flow_counter_pool *
 flow_dv_counter_pool_prepare(struct rte_eth_dev *dev,
 			     struct mlx5_flow_counter **cnt_free,
-			     uint32_t batch, uint32_t age)
+			     uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_counters tmp_tq;
-	struct mlx5_devx_obj *last_min_dcs;
 	struct mlx5_devx_obj *dcs = NULL;
 	struct mlx5_flow_counter *cnt;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
-	uint32_t add2other;
+	uint32_t fallback = priv->counter_fallback;
 	uint32_t i;
 
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	if (!batch) {
-retry:
-		add2other = 0;
+	if (fallback) {
 		/* bulk_bitmap must be 0 for single counter allocation. */
 		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0);
 		if (!dcs)
 			return NULL;
-		pool = flow_dv_find_pool_by_id(cont, dcs->id);
-		/*
-		 * If pool eixsts but with other type, counter will be added
-		 * to the other pool, need to reallocate new counter in the
-		 * ragne with same type later.
-		 */
-		if (pool && ((!!IS_AGE_POOL(pool)) != age)) {
-			add2other = 1;
-		} else if (!pool) {
-			pool = flow_dv_pool_create(dev, dcs, batch,
-						   age);
+		pool = flow_dv_find_pool_by_id(cmng, dcs->id);
+		if (!pool) {
+			pool = flow_dv_pool_create(dev, dcs, age);
 			if (!pool) {
 				mlx5_devx_cmd_destroy(dcs);
 				return NULL;
 			}
 		}
-		if ((dcs->id < pool->min_dcs->id ||
-		    pool->min_dcs->id &
-		    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1)) &&
-		    !(dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))) {
-			/*
-			 * Update the pool min_dcs only if current dcs is
-			 * valid and exist min_dcs is not valid or greater
-			 * than new dcs.
-			 */
-			last_min_dcs = pool->min_dcs;
-			rte_atomic64_set(&pool->a64_dcs,
-					 (int64_t)(uintptr_t)dcs);
-			/*
-			 * Restore any skipped counters if the new min_dcs
-			 * ID is smaller or min_dcs is not valid.
-			 */
-			if (dcs->id < last_min_dcs->id ||
-			    last_min_dcs->id &
-			    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))
-				flow_dv_counter_restore(pool, last_min_dcs);
-		}
 		i = dcs->id % MLX5_COUNTERS_PER_POOL;
 		cnt = MLX5_POOL_GET_CNT(pool, i);
 		cnt->pool = pool;
 		MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs;
-		/*
-		 * If min_dcs is not valid, it means the new allocated dcs
-		 * also fail to become the valid min_dcs, just skip it.
-		 * Or if min_dcs is valid, and new dcs ID is smaller than
-		 * min_dcs, but not become the min_dcs, also skip it.
-		 */
-		if (pool->min_dcs->id &
-		    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1) ||
-		    dcs->id < pool->min_dcs->id) {
-			MLX5_GET_POOL_CNT_EXT(pool, i)->skipped = 1;
-			pool->skip_cnt = 1;
-			goto retry;
-		}
-		if (add2other) {
-			TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen],
-					  cnt, next);
-			goto retry;
-		}
 		*cnt_free = cnt;
 		return pool;
 	}
@@ -4671,7 +4539,7 @@ struct field_modify_info modify_tcp[] = {
 		rte_errno = ENODATA;
 		return NULL;
 	}
-	pool = flow_dv_pool_create(dev, dcs, batch, age);
+	pool = flow_dv_pool_create(dev, dcs, age);
 	if (!pool) {
 		mlx5_devx_cmd_destroy(dcs);
 		return NULL;
@@ -4682,9 +4550,9 @@ struct field_modify_info modify_tcp[] = {
 		cnt->pool = pool;
 		TAILQ_INSERT_HEAD(&tmp_tq, cnt, next);
 	}
-	rte_spinlock_lock(&cont->csl);
-	TAILQ_CONCAT(&cont->counters[cnt_type], &tmp_tq, next);
-	rte_spinlock_unlock(&cont->csl);
+	rte_spinlock_lock(&cmng->csl);
+	TAILQ_CONCAT(&cmng->counters[cnt_type], &tmp_tq, next);
+	rte_spinlock_unlock(&cmng->csl);
 	*cnt_free = MLX5_POOL_GET_CNT(pool, 0);
 	(*cnt_free)->pool = pool;
 	return pool;
@@ -4721,8 +4589,6 @@ struct field_modify_info modify_tcp[] = {
  *   Indicate if this counter is shared with other flows.
  * @param[in] id
  *   Counter identifier.
- * @param[in] group
- *   Counter flow group.
  * @param[in] age
  *   Whether the counter was allocated for aging.
  *
@@ -4731,22 +4597,14 @@ struct field_modify_info modify_tcp[] = {
  */
 static uint32_t
 flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id,
-		      uint16_t group, uint32_t age)
+		      uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt_free = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
-	/*
-	 * Currently group 0 flow counter cannot be assigned to a flow if it is
-	 * not the first one in the batch counter allocation, so it is better
-	 * to allocate counters one by one for these flows in a separate
-	 * container.
-	 * A counter can be shared between different groups so need to take
-	 * shared counters from the single container.
-	 */
-	uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	uint32_t fallback = priv->counter_fallback;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t cnt_idx;
@@ -4769,16 +4627,15 @@ struct field_modify_info modify_tcp[] = {
 		}
 	}
 	/* Get free counters from container. */
-	rte_spinlock_lock(&cont->csl);
-	cnt_free = TAILQ_FIRST(&cont->counters[cnt_type]);
+	rte_spinlock_lock(&cmng->csl);
+	cnt_free = TAILQ_FIRST(&cmng->counters[cnt_type]);
 	if (cnt_free)
-		TAILQ_REMOVE(&cont->counters[cnt_type], cnt_free, next);
-	rte_spinlock_unlock(&cont->csl);
-	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free,
-						       batch, age))
+		TAILQ_REMOVE(&cmng->counters[cnt_type], cnt_free, next);
+	rte_spinlock_unlock(&cmng->csl);
+	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free, age))
 		goto err;
 	pool = cnt_free->pool;
-	if (!batch)
+	if (fallback)
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free);
 	/* Create a DV counter action only in the first time usage. */
 	if (!cnt_free->action) {
@@ -4786,7 +4643,7 @@ struct field_modify_info modify_tcp[] = {
 		struct mlx5_devx_obj *dcs;
 		int ret;
 
-		if (batch) {
+		if (!fallback) {
 			offset = MLX5_CNT_ARRAY_IDX(pool, cnt_free);
 			dcs = pool->min_dcs;
 		} else {
@@ -4802,7 +4659,6 @@ struct field_modify_info modify_tcp[] = {
 	}
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool->index,
 				MLX5_CNT_ARRAY_IDX(pool, cnt_free));
-	cnt_idx += batch * MLX5_CNT_BATCH_OFFSET;
 	/* Update the counter reset values. */
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
@@ -4817,16 +4673,16 @@ struct field_modify_info modify_tcp[] = {
 		cnt_free->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
-	if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on)
+	if (!fallback && !priv->sh->cmng.query_thread_on)
 		/* Start the asynchronous batch query by the host thread. */
 		mlx5_set_query_alarm(priv->sh);
 	return cnt_idx;
 err:
 	if (cnt_free) {
 		cnt_free->pool = pool;
-		rte_spinlock_lock(&cont->csl);
-		TAILQ_INSERT_TAIL(&cont->counters[cnt_type], cnt_free, next);
-		rte_spinlock_unlock(&cont->csl);
+		rte_spinlock_lock(&cmng->csl);
+		TAILQ_INSERT_TAIL(&cmng->counters[cnt_type], cnt_free, next);
+		rte_spinlock_unlock(&cmng->csl);
 	}
 	return 0;
 }
@@ -4909,7 +4765,6 @@ struct field_modify_info modify_tcp[] = {
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-
 	if (IS_SHARED_CNT(counter)) {
 		if (--cnt->shared_info.ref_cnt)
 			return;
@@ -4934,13 +4789,43 @@ struct field_modify_info modify_tcp[] = {
 	} else {
 		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
 					       MLX5_COUNTER_TYPE_ORIGIN;
-		TAILQ_INSERT_TAIL(&((MLX5_CNT_CONTAINER
-				  (priv->sh, 0))->counters[cnt_type]),
+		TAILQ_INSERT_TAIL(&priv->sh->cmng.counters[cnt_type],
 				  cnt, next);
 	}
 }
 
 /**
+ * Create a counter action with invalid offset.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ *
+ * @return
+ *   Counter action pointer if success, NULL otherwise.
+ */
+static void*
+flow_dv_counter_create_invalid(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (!priv->cnt_dcs) {
+		priv->cnt_dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx,
+								 0x4);
+		if (!priv->cnt_dcs)
+			return NULL;
+	}
+	if (!priv->cnt_action) {
+		ret = mlx5_flow_os_create_flow_action_count(priv->cnt_dcs->obj,
+							    UINT16_MAX,
+							    &priv->cnt_action);
+		if (ret)
+			return NULL;
+	}
+	return priv->cnt_action;
+}
+
+/**
  * Verify the @p attributes will be correctly understood by the NIC and store
  * them in the @p flow if everything is correct.
  *
@@ -5781,6 +5666,8 @@ struct field_modify_info modify_tcp[] = {
 			action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP;
 			rw_act_num += MLX5_ACT_NUM_SET_DSCP;
 			break;
+		case MLX5_RTE_FLOW_ACTION_TYPE_COUNT:
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ACTION,
@@ -7988,8 +7875,7 @@ struct field_modify_info modify_tcp[] = {
 
 	counter = flow_dv_counter_alloc(dev,
 				count ? count->shared : 0,
-				count ? count->id : 0,
-				dev_flow->dv.group, !!age);
+				count ? count->id : 0, !!age);
 	if (!counter || age == NULL)
 		return counter;
 	age_param  = flow_dv_counter_idx_get_age(dev, counter);
@@ -8359,6 +8245,13 @@ struct field_modify_info modify_tcp[] = {
 				age = action->conf;
 			action_flags |= MLX5_FLOW_ACTION_COUNT;
 			break;
+		case MLX5_RTE_FLOW_ACTION_TYPE_COUNT:
+			if (flow_dv_counter_create_invalid(dev) == NULL)
+				return rte_flow_error_set(error, ENOTSUP,
+				       RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				       "count action not supported");
+			dev_flow->dv.actions[actions_n++] = priv->cnt_action;
+			break;
 		case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
 			dev_flow->dv.actions[actions_n++] =
 						priv->sh->pop_vlan_action;
@@ -10112,7 +10005,7 @@ struct field_modify_info modify_tcp[] = {
 	uint32_t cnt;
 
 	flow_dv_shared_lock(dev);
-	cnt = flow_dv_counter_alloc(dev, 0, 0, 1, 0);
+	cnt = flow_dv_counter_alloc(dev, 0, 0, 0);
 	flow_dv_shared_unlock(dev);
 	return cnt;
 }
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 0463bea..f3b0e89 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -159,11 +159,11 @@
 			      struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
 
 	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
-	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+	pool = cmng->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
 		*ppool = pool;
@@ -254,12 +254,12 @@
 flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
 	union mlx5_l3t_data data;
-	uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
+	uint32_t n_valid = rte_atomic16_read(&cmng->n_valid);
 	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
@@ -275,7 +275,7 @@
 		return data.dword;
 	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
-		pool = cont->pools[pool_idx];
+		pool = cmng->pools[pool_idx];
 		if (!pool)
 			continue;
 		cnt = TAILQ_FIRST(&pool->counters[0]);
@@ -286,7 +286,7 @@
 		struct mlx5_flow_counter_pool **pools;
 		uint32_t size;
 
-		if (n_valid == cont->n) {
+		if (n_valid == cmng->n) {
 			/* Resize the container pool array. */
 			size = sizeof(struct mlx5_flow_counter_pool *) *
 				     (n_valid + MLX5_CNT_CONTAINER_RESIZE);
@@ -295,13 +295,13 @@
 			if (!pools)
 				return 0;
 			if (n_valid) {
-				memcpy(pools, cont->pools,
+				memcpy(pools, cmng->pools,
 				       sizeof(struct mlx5_flow_counter_pool *) *
 				       n_valid);
-				mlx5_free(cont->pools);
+				mlx5_free(cmng->pools);
 			}
-			cont->pools = pools;
-			cont->n += MLX5_CNT_CONTAINER_RESIZE;
+			cmng->pools = pools;
+			cmng->n += MLX5_CNT_CONTAINER_RESIZE;
 		}
 		/* Allocate memory for new pool*/
 		size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) *
@@ -315,10 +315,10 @@
 			TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 		}
 		cnt = MLX5_POOL_GET_CNT(pool, 0);
-		cont->pools[n_valid] = pool;
+		cmng->pools[n_valid] = pool;
 		pool_idx = n_valid;
-		rte_atomic16_add(&cont->n_valid, 1);
-		TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+		rte_atomic16_add(&cmng->n_valid, 1);
+		TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH 4/6] net/mlx5: synchronize flow counter pool creation
  2020-10-06 11:38 [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe Suanming Mou
                   ` (2 preceding siblings ...)
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 3/6] net/mlx5: remove single counter container Suanming Mou
@ 2020-10-06 11:38 ` Suanming Mou
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 5/6] net/mlx5: make three level table thread safe Suanming Mou
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-06 11:38 UTC (permalink / raw)
  To: viacheslavo, matan; +Cc: rasland, dev

Currently, counter operations are not thread safe as the counter
pools' array resize is not protected.

This commit protects the container pools' array resize with a
spinlock. The counter pool statistic memory allocation is moved to
the host thread in order to minimize the critical section, since the
pool statistic memory is required only at query time. The pools'
array itself must still be resized by the user threads: a new pool
may be used by other rte_flow APIs before the host thread runs, and
if the pool were not saved to the counter management pools' array,
its counter memory could not be found. The pool raw statistic memory
is filled in by the host thread.
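
Roughly, the create/publish pattern introduced below looks like this
(a pseudo-C sketch condensed from the flow_dv_pool_create() and
mlx5_flow_query_alarm() hunks; resize_pool_array() is a hypothetical
stand-in for flow_dv_container_resize() and error handling is
trimmed):

    /* User thread: publish the new pool under the update lock. */
    rte_spinlock_lock(&cmng->pool_update_sl);
    pool->index = cmng->n_valid;
    if (pool->index == cmng->n && resize_pool_array(cmng)) {
            rte_spinlock_unlock(&cmng->pool_update_sl);
            return NULL; /* Resize failed, pool is not published. */
    }
    cmng->pools[pool->index] = pool;
    pool->raw = NULL; /* Statistic memory is assigned later. */
    cmng->n_valid++;
    rte_spinlock_unlock(&cmng->pool_update_sl);

    /* Host thread (query alarm): assign statistic memory lazily. */
    if (!pool->raw && mlx5_flow_set_counter_stat_mem(cmng, pool))
            goto set_alarm; /* Retry on the next alarm round. */

Until the raw memory is assigned, _flow_dv_query_count() simply
reports zero hits and bytes for counters of that pool.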

The shared counters will be protected in a separate commit.

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/mlx5.c            |  12 ++-
 drivers/net/mlx5/mlx5.h            |  11 ++-
 drivers/net/mlx5/mlx5_flow.c       | 127 ++++++++++++++++++++++++++++++--
 drivers/net/mlx5/mlx5_flow_dv.c    | 146 ++++++-------------------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |   5 +-
 5 files changed, 160 insertions(+), 141 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 96cebba..79c5563 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -465,8 +465,7 @@ struct mlx5_flow_id_pool *
 	sh->cmng.min_id = MLX5_CNT_BATCH_OFFSET;
 	sh->cmng.max_id = -1;
 	sh->cmng.last_pool_idx = POOL_IDX_INVALID;
-	TAILQ_INIT(&sh->cmng.pool_list);
-	rte_spinlock_init(&sh->cmng.resize_sl);
+	rte_spinlock_init(&sh->cmng.pool_update_sl);
 	for (i = 0; i < MLX5_COUNTER_TYPE_MAX; i++)
 		TAILQ_INIT(&sh->cmng.counters[i]);
 	rte_spinlock_init(&sh->cmng.csl);
@@ -499,7 +498,7 @@ struct mlx5_flow_id_pool *
 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
 {
 	struct mlx5_counter_stats_mem_mng *mng;
-	int j;
+	int i, j;
 	int retries = 1024;
 
 	rte_errno = 0;
@@ -512,9 +511,10 @@ struct mlx5_flow_id_pool *
 
 	if (sh->cmng.pools) {
 		struct mlx5_flow_counter_pool *pool;
+		int n_valid = sh->cmng.n_valid;
 
-		pool = TAILQ_FIRST(&sh->cmng.pool_list);
-		while (pool) {
+		for (i = 0; i < n_valid; ++i) {
+			pool = sh->cmng.pools[i];
 			if (!IS_EXT_POOL(pool) && pool->min_dcs)
 				claim_zero(mlx5_devx_cmd_destroy
 							       (pool->min_dcs));
@@ -530,9 +530,7 @@ struct mlx5_flow_id_pool *
 						   (MLX5_GET_POOL_CNT_EXT
 						    (pool, j)->dcs));
 			}
-			TAILQ_REMOVE(&sh->cmng.pool_list, pool, next);
 			mlx5_free(pool);
-			pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		}
 		mlx5_free(sh->cmng.pools);
 	}
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a3d4ad9..8c951e2 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -395,7 +395,11 @@ struct mlx5_flow_counter_pool {
 	volatile uint32_t query_gen:1; /* Query round. */
 	rte_spinlock_t sl; /* The pool lock. */
 	struct mlx5_counter_stats_raw *raw;
-	struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */
+	union {
+		struct rte_eth_dev *dev; /* The counter pool create device. */
+		struct mlx5_counter_stats_raw *raw_hw;
+		/* The raw on HW working. */
+	};
 };
 
 struct mlx5_counter_stats_raw;
@@ -419,16 +423,15 @@ struct mlx5_counter_stats_raw {
 
 /* Counter global management structure. */
 struct mlx5_flow_counter_mng {
-	rte_atomic16_t n_valid; /* Number of valid pools. */
+	volatile uint16_t n_valid; /* Number of valid pools. */
 	uint16_t n; /* Number of pools. */
 	uint16_t last_pool_idx; /* Last used pool index */
 	int min_id; /* The minimum counter ID in the pools. */
 	int max_id; /* The maximum counter ID in the pools. */
-	rte_spinlock_t resize_sl; /* The resize lock. */
+	rte_spinlock_t pool_update_sl; /* The pool update lock. */
 	rte_spinlock_t csl; /* The counter free list lock. */
 	struct mlx5_counters counters[MLX5_COUNTER_TYPE_MAX];
 	/* Free counter list. */
-	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	/* Hold the memory management for the next allocated pools raws. */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c280f56..a9664b8 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -13,6 +13,7 @@
 #include <rte_common.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
+#include <rte_eal_paging.h>
 #include <rte_flow.h>
 #include <rte_cycles.h>
 #include <rte_flow_driver.h>
@@ -29,6 +30,7 @@
 #include "mlx5_flow.h"
 #include "mlx5_flow_os.h"
 #include "mlx5_rxtx.h"
+#include "mlx5_common_os.h"
 
 /** Device flow drivers. */
 extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
@@ -5880,6 +5882,116 @@ struct mlx5_meter_domains_infos *
 	return -ENOTSUP;
 }
 
+/**
+ * Allocate a new memory for the counter values wrapped by all the needed
+ * management.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] raws_n
+ *   The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_flow_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_devx_mkey_attr mkey_attr;
+	struct mlx5_counter_stats_mem_mng *mem_mng;
+	volatile struct flow_counter_stats *raw_data;
+	int size = (sizeof(struct flow_counter_stats) *
+			MLX5_COUNTERS_PER_POOL +
+			sizeof(struct mlx5_counter_stats_raw)) * raws_n +
+			sizeof(struct mlx5_counter_stats_mem_mng);
+	size_t pgsize = rte_mem_page_size();
+	if (pgsize == (size_t)-1) {
+		DRV_LOG(ERR, "Failed to get mem page size");
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+	uint8_t *mem = mlx5_malloc(MLX5_MEM_ZERO, size, pgsize,
+				  SOCKET_ID_ANY);
+	int i;
+
+	if (!mem) {
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+	mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
+	size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
+	mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
+						 IBV_ACCESS_LOCAL_WRITE);
+	if (!mem_mng->umem) {
+		rte_errno = errno;
+		mlx5_free(mem);
+		return -rte_errno;
+	}
+	mkey_attr.addr = (uintptr_t)mem;
+	mkey_attr.size = size;
+	mkey_attr.umem_id = mlx5_os_get_umem_id(mem_mng->umem);
+	mkey_attr.pd = sh->pdn;
+	mkey_attr.log_entity_size = 0;
+	mkey_attr.pg_access = 0;
+	mkey_attr.klm_array = NULL;
+	mkey_attr.klm_num = 0;
+	if (priv->config.hca_attr.relaxed_ordering_write &&
+		priv->config.hca_attr.relaxed_ordering_read  &&
+		!haswell_broadwell_cpu)
+		mkey_attr.relaxed_ordering = 1;
+	mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
+	if (!mem_mng->dm) {
+		mlx5_glue->devx_umem_dereg(mem_mng->umem);
+		rte_errno = errno;
+		mlx5_free(mem);
+		return -rte_errno;
+	}
+	mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
+	raw_data = (volatile struct flow_counter_stats *)mem;
+	for (i = 0; i < raws_n; ++i) {
+		mem_mng->raws[i].mem_mng = mem_mng;
+		mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
+	}
+	for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
+		LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws,
+				 mem_mng->raws + MLX5_CNT_CONTAINER_RESIZE + i,
+				 next);
+	LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
+	priv->sh->cmng.mem_mng = mem_mng;
+	return 0;
+}
+
+/**
+ * Set the statistic memory to the new counter pool.
+ *
+ * @param[in] cmng
+ *   Pointer to the counter management.
+ * @param[in] pool
+ *   Pointer to the pool to set the statistic memory.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_flow_set_counter_stat_mem(struct mlx5_flow_counter_mng *cmng,
+			       struct mlx5_flow_counter_pool *pool)
+{
+	/* Resize statistic memory once used out. */
+	if (!(pool->index % MLX5_CNT_CONTAINER_RESIZE) &&
+	    mlx5_flow_create_counter_stat_mem_mng(pool->dev,
+	    MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES)) {
+		DRV_LOG(ERR, "Cannot resize counter stat mem.");
+		return -1;
+	}
+	MLX5_ASSERT(pool->index < cmng->n_valid);
+	pool->raw = cmng->mem_mng->raws + pool->index %
+		    MLX5_CNT_CONTAINER_RESIZE;
+	pool->raw_hw = NULL;
+	return 0;
+}
+
 #define MLX5_POOL_QUERY_FREQ_US 1000000
 
 /**
@@ -5894,7 +6006,7 @@ struct mlx5_meter_domains_infos *
 {
 	uint32_t pools_n, us;
 
-	pools_n = rte_atomic16_read(&sh->cmng.n_valid);
+	pools_n = sh->cmng.n_valid;
 	us = MLX5_POOL_QUERY_FREQ_US / pools_n;
 	DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us);
 	if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
@@ -5920,16 +6032,21 @@ struct mlx5_meter_domains_infos *
 	uint16_t pool_index = sh->cmng.pool_index;
 	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
+	int n_valid;
 
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
-	rte_spinlock_lock(&cmng->resize_sl);
+	rte_spinlock_lock(&cmng->pool_update_sl);
 	if (!cmng->pools) {
-		rte_spinlock_unlock(&cmng->resize_sl);
+		rte_spinlock_unlock(&cmng->pool_update_sl);
 		goto set_alarm;
 	}
 	pool = cmng->pools[pool_index];
-	rte_spinlock_unlock(&cmng->resize_sl);
+	n_valid = cmng->n_valid;
+	rte_spinlock_unlock(&cmng->pool_update_sl);
+	/* Set the statistic memory to the new created pool. */
+	if (!pool->raw && mlx5_flow_set_counter_stat_mem(cmng, pool))
+		goto set_alarm;
 	if (pool->raw_hw)
 		/* There is a pool query in progress. */
 		goto set_alarm;
@@ -5962,7 +6079,7 @@ struct mlx5_meter_domains_infos *
 	LIST_REMOVE(pool->raw_hw, next);
 	sh->cmng.pending_queries++;
 	pool_index++;
-	if (pool_index >= rte_atomic16_read(&cmng->n_valid))
+	if (pool_index >= n_valid)
 		pool_index = 0;
 set_alarm:
 	sh->cmng.pool_index = pool_index;
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 43d8ea8..31d7fe4 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4231,7 +4231,7 @@ struct field_modify_info modify_tcp[] = {
 	 * ID is sequence increasing, and the last pool should be the needed
 	 * one.
 	 */
-	i = rte_atomic16_read(&cmng->n_valid);
+	i = cmng->n_valid;
 	while (i--) {
 		struct mlx5_flow_counter_pool *pool = cmng->pools[i];
 
@@ -4242,83 +4242,6 @@ struct field_modify_info modify_tcp[] = {
 }
 
 /**
- * Allocate a new memory for the counter values wrapped by all the needed
- * management.
- *
- * @param[in] dev
- *   Pointer to the Ethernet device structure.
- * @param[in] raws_n
- *   The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters.
- *
- * @return
- *   The new memory management pointer on success, otherwise NULL and rte_errno
- *   is set.
- */
-static struct mlx5_counter_stats_mem_mng *
-flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_dev_ctx_shared *sh = priv->sh;
-	struct mlx5_devx_mkey_attr mkey_attr;
-	struct mlx5_counter_stats_mem_mng *mem_mng;
-	volatile struct flow_counter_stats *raw_data;
-	int size = (sizeof(struct flow_counter_stats) *
-			MLX5_COUNTERS_PER_POOL +
-			sizeof(struct mlx5_counter_stats_raw)) * raws_n +
-			sizeof(struct mlx5_counter_stats_mem_mng);
-	size_t pgsize = rte_mem_page_size();
-	if (pgsize == (size_t)-1) {
-		DRV_LOG(ERR, "Failed to get mem page size");
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	uint8_t *mem = mlx5_malloc(MLX5_MEM_ZERO, size, pgsize,
-				  SOCKET_ID_ANY);
-	int i;
-
-	if (!mem) {
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
-	size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
-	mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
-						 IBV_ACCESS_LOCAL_WRITE);
-	if (!mem_mng->umem) {
-		rte_errno = errno;
-		mlx5_free(mem);
-		return NULL;
-	}
-	mkey_attr.addr = (uintptr_t)mem;
-	mkey_attr.size = size;
-	mkey_attr.umem_id = mlx5_os_get_umem_id(mem_mng->umem);
-	mkey_attr.pd = sh->pdn;
-	mkey_attr.log_entity_size = 0;
-	mkey_attr.pg_access = 0;
-	mkey_attr.klm_array = NULL;
-	mkey_attr.klm_num = 0;
-	if (priv->config.hca_attr.relaxed_ordering_write &&
-		priv->config.hca_attr.relaxed_ordering_read  &&
-		!haswell_broadwell_cpu)
-		mkey_attr.relaxed_ordering = 1;
-	mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
-	if (!mem_mng->dm) {
-		mlx5_glue->devx_umem_dereg(mem_mng->umem);
-		rte_errno = errno;
-		mlx5_free(mem);
-		return NULL;
-	}
-	mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
-	raw_data = (volatile struct flow_counter_stats *)mem;
-	for (i = 0; i < raws_n; ++i) {
-		mem_mng->raws[i].mem_mng = mem_mng;
-		mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
-	}
-	LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
-	return mem_mng;
-}
-
-/**
  * Resize a counter container.
  *
  * @param[in] dev
@@ -4332,7 +4255,6 @@ struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
-	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
 	void *old_pools = cmng->pools;
 	uint32_t resize = cmng->n + MLX5_CNT_CONTAINER_RESIZE;
 	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
@@ -4345,30 +4267,8 @@ struct field_modify_info modify_tcp[] = {
 	if (old_pools)
 		memcpy(pools, old_pools, cmng->n *
 				       sizeof(struct mlx5_flow_counter_pool *));
-	/*
-	 * Fallback mode query the counter directly, no background query
-	 * resources are needed.
-	 */
-	if (!priv->counter_fallback) {
-		int i;
-
-		mem_mng = flow_dv_create_counter_stat_mem_mng(dev,
-			  MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES);
-		if (!mem_mng) {
-			mlx5_free(pools);
-			return -ENOMEM;
-		}
-		for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
-			LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws,
-					 mem_mng->raws +
-					 MLX5_CNT_CONTAINER_RESIZE +
-					 i, next);
-	}
-	rte_spinlock_lock(&cmng->resize_sl);
 	cmng->n = resize;
-	cmng->mem_mng = mem_mng;
 	cmng->pools = pools;
-	rte_spinlock_unlock(&cmng->resize_sl);
 	if (old_pools)
 		mlx5_free(old_pools);
 	return 0;
@@ -4406,11 +4306,15 @@ struct field_modify_info modify_tcp[] = {
 		return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
 					0, pkts, bytes, 0, NULL, NULL, 0);
 	}
-
 	rte_spinlock_lock(&pool->sl);
-	offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
-	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
-	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	if (!pool->raw) {
+		*pkts = 0;
+		*bytes = 0;
+	} else {
+		offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
+		*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+		*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	}
 	rte_spinlock_unlock(&pool->sl);
 	return 0;
 }
@@ -4437,12 +4341,9 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
-	int16_t n_valid = rte_atomic16_read(&cmng->n_valid);
 	uint32_t fallback = priv->counter_fallback;
 	uint32_t size = sizeof(*pool);
 
-	if (cmng->n == n_valid && flow_dv_container_resize(dev))
-		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
 	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
 	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
@@ -4451,23 +4352,25 @@ struct field_modify_info modify_tcp[] = {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
-	if (!fallback) {
-		pool->min_dcs = dcs;
-		pool->raw = cmng->mem_mng->raws + n_valid %
-						      MLX5_CNT_CONTAINER_RESIZE;
-	}
-	pool->raw_hw = NULL;
+	pool->raw = NULL;
 	pool->type = 0;
-	pool->type |= (!fallback ? 0 :  CNT_POOL_TYPE_EXT);
 	pool->type |= (!age ? 0 :  CNT_POOL_TYPE_AGE);
 	pool->query_gen = 0;
+	pool->min_dcs = dcs;
 	rte_spinlock_init(&pool->sl);
 	TAILQ_INIT(&pool->counters[0]);
 	TAILQ_INIT(&pool->counters[1]);
-	TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
-	pool->index = n_valid;
-	cmng->pools[n_valid] = pool;
-	if (fallback) {
+	rte_spinlock_lock(&cmng->pool_update_sl);
+	pool->index = cmng->n_valid;
+	if (pool->index == cmng->n && flow_dv_container_resize(dev)) {
+		mlx5_free(pool);
+		rte_spinlock_unlock(&cmng->pool_update_sl);
+		return NULL;
+	}
+	cmng->pools[pool->index] = pool;
+	pool->dev = dev;
+	cmng->n_valid++;
+	if (unlikely(fallback)) {
 		int base = RTE_ALIGN_FLOOR(dcs->id, MLX5_COUNTERS_PER_POOL);
 
 		if (base < cmng->min_id)
@@ -4475,10 +4378,9 @@ struct field_modify_info modify_tcp[] = {
 		if (base > cmng->max_id)
 			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
 		cmng->last_pool_idx = pool->index;
+		pool->type |= CNT_POOL_TYPE_EXT;
 	}
-	/* Pool initialization must be updated before host thread access. */
-	rte_io_wmb();
-	rte_atomic16_add(&cmng->n_valid, 1);
+	rte_spinlock_unlock(&cmng->pool_update_sl);
 	return pool;
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index f3b0e89..69ecc27 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -259,7 +259,7 @@
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
 	union mlx5_l3t_data data;
-	uint32_t n_valid = rte_atomic16_read(&cmng->n_valid);
+	uint32_t n_valid = cmng->n_valid;
 	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
@@ -317,8 +317,7 @@
 		cnt = MLX5_POOL_GET_CNT(pool, 0);
 		cmng->pools[n_valid] = pool;
 		pool_idx = n_valid;
-		rte_atomic16_add(&cmng->n_valid, 1);
-		TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
+		cmng->n_valid++;
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH 5/6] net/mlx5: make three level table thread safe
  2020-10-06 11:38 [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe Suanming Mou
                   ` (3 preceding siblings ...)
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 4/6] net/mlx5: synchronize flow counter pool creation Suanming Mou
@ 2020-10-06 11:38 ` Suanming Mou
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 6/6] net/mlx5: make shared counters " Suanming Mou
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
  6 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-06 11:38 UTC (permalink / raw)
  To: viacheslavo, matan; +Cc: rasland, dev

This commit adds thread safety to the three-level table by using a
spinlock and a reference counter for each table entry.
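
The per-entry lifetime then follows this pattern (pseudo-C condensed
from the mlx5_l3t_set_entry()/mlx5_l3t_clear_entry() hunks below,
shown for the dword entry type; the lock covers the whole lookup):

    rte_spinlock_lock(&tbl->sl);
    if (dw_e_tbl->entry[idx].data) {
            /* Already set: return the data and take a reference. */
            data->dword = dw_e_tbl->entry[idx].data;
            dw_e_tbl->entry[idx].ref_cnt++;
            rte_errno = EEXIST;
    } else {
            dw_e_tbl->entry[idx].data = data->dword;
            dw_e_tbl->entry[idx].ref_cnt = 1;
            dw_e_tbl->ref_cnt++; /* Table-level count for cleanup. */
    }
    rte_spinlock_unlock(&tbl->sl);

mlx5_l3t_clear_entry() symmetrically decrements the entry reference
counter under the same lock, clears the entry only when the counter
drops to zero, and returns the remaining count to the caller.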

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/mlx5_utils.c | 144 +++++++++++++++++++++++++++++++-----------
 drivers/net/mlx5/mlx5_utils.h |  52 ++++++++++-----
 2 files changed, 142 insertions(+), 54 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_utils.c b/drivers/net/mlx5/mlx5_utils.c
index fefe833..f3c259d 100644
--- a/drivers/net/mlx5/mlx5_utils.c
+++ b/drivers/net/mlx5/mlx5_utils.c
@@ -551,26 +551,23 @@ struct mlx5_l3t_tbl *
 	tbl->type = type;
 	switch (type) {
 	case MLX5_L3T_TYPE_WORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_word) +
-				  sizeof(uint16_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_word);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_w";
 		break;
 	case MLX5_L3T_TYPE_DWORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_dword) +
-				  sizeof(uint32_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_dword);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_dw";
 		break;
 	case MLX5_L3T_TYPE_QWORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_qword) +
-				  sizeof(uint64_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_qword);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_qw";
 		break;
 	default:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_ptr) +
-				  sizeof(void *) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_ptr);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_tpr";
 		break;
 	}
+	rte_spinlock_init(&tbl->sl);
 	tbl->eip = mlx5_ipool_create(&l3t_ip_cfg);
 	if (!tbl->eip) {
 		rte_errno = ENOMEM;
@@ -620,46 +617,63 @@ struct mlx5_l3t_tbl *
 	mlx5_free(tbl);
 }
 
-uint32_t
+int32_t
 mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 		   union mlx5_l3t_data *data)
 {
 	struct mlx5_l3t_level_tbl *g_tbl, *m_tbl;
+	struct mlx5_l3t_entry_word *w_e_tbl;
+	struct mlx5_l3t_entry_dword *dw_e_tbl;
+	struct mlx5_l3t_entry_qword *qw_e_tbl;
+	struct mlx5_l3t_entry_ptr *ptr_e_tbl;
 	void *e_tbl;
 	uint32_t entry_idx;
+	int32_t ret = -1;
 
+	rte_spinlock_lock(&tbl->sl);
 	g_tbl = tbl->tbl;
 	if (!g_tbl)
-		return -1;
+		goto out;
 	m_tbl = g_tbl->tbl[(idx >> MLX5_L3T_GT_OFFSET) & MLX5_L3T_GT_MASK];
 	if (!m_tbl)
-		return -1;
+		goto out;
 	e_tbl = m_tbl->tbl[(idx >> MLX5_L3T_MT_OFFSET) & MLX5_L3T_MT_MASK];
 	if (!e_tbl)
-		return -1;
+		goto out;
+	ret = 0;
 	entry_idx = idx & MLX5_L3T_ET_MASK;
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
-		data->word = ((struct mlx5_l3t_entry_word *)e_tbl)->entry
-			     [entry_idx];
+		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
+		data->word = w_e_tbl->entry[entry_idx].data;
+		if (w_e_tbl->entry[entry_idx].data)
+			w_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
-		data->dword = ((struct mlx5_l3t_entry_dword *)e_tbl)->entry
-			     [entry_idx];
+		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
+		data->dword = dw_e_tbl->entry[entry_idx].data;
+		if (dw_e_tbl->entry[entry_idx].data)
+			dw_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
-		data->qword = ((struct mlx5_l3t_entry_qword *)e_tbl)->entry
-			      [entry_idx];
+		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
+		data->qword = qw_e_tbl->entry[entry_idx].data;
+		if (qw_e_tbl->entry[entry_idx].data)
+			qw_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	default:
-		data->ptr = ((struct mlx5_l3t_entry_ptr *)e_tbl)->entry
-			    [entry_idx];
+		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
+		data->ptr = ptr_e_tbl->entry[entry_idx].data;
+		if (ptr_e_tbl->entry[entry_idx].data)
+			ptr_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	}
-	return 0;
+out:
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
 }
 
-void
+int32_t
 mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx)
 {
 	struct mlx5_l3t_level_tbl *g_tbl, *m_tbl;
@@ -670,36 +684,54 @@ struct mlx5_l3t_tbl *
 	void *e_tbl;
 	uint32_t entry_idx;
 	uint64_t ref_cnt;
+	int32_t ret = -1;
 
+	rte_spinlock_lock(&tbl->sl);
 	g_tbl = tbl->tbl;
 	if (!g_tbl)
-		return;
+		goto out;
 	m_tbl = g_tbl->tbl[(idx >> MLX5_L3T_GT_OFFSET) & MLX5_L3T_GT_MASK];
 	if (!m_tbl)
-		return;
+		goto out;
 	e_tbl = m_tbl->tbl[(idx >> MLX5_L3T_MT_OFFSET) & MLX5_L3T_MT_MASK];
 	if (!e_tbl)
-		return;
+		goto out;
 	entry_idx = idx & MLX5_L3T_ET_MASK;
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
 		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
-		w_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(w_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --w_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		w_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --w_e_tbl->ref_cnt;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
 		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
-		dw_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(dw_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --dw_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		dw_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --dw_e_tbl->ref_cnt;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
 		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
-		qw_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(qw_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --qw_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		qw_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --qw_e_tbl->ref_cnt;
 		break;
 	default:
 		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
-		ptr_e_tbl->entry[entry_idx] = NULL;
+		MLX5_ASSERT(ptr_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --ptr_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		ptr_e_tbl->entry[entry_idx].data = NULL;
 		ref_cnt = --ptr_e_tbl->ref_cnt;
 		break;
 	}
@@ -718,9 +750,12 @@ struct mlx5_l3t_tbl *
 			}
 		}
 	}
+out:
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
 }
 
-uint32_t
+int32_t
 mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 		   union mlx5_l3t_data *data)
 {
@@ -731,8 +766,10 @@ struct mlx5_l3t_tbl *
 	struct mlx5_l3t_entry_ptr *ptr_e_tbl;
 	void *e_tbl;
 	uint32_t entry_idx, tbl_idx = 0;
+	int32_t ret = -1;
 
 	/* Check the global table, create it if empty. */
+	rte_spinlock_lock(&tbl->sl);
 	g_tbl = tbl->tbl;
 	if (!g_tbl) {
 		g_tbl = mlx5_malloc(MLX5_MEM_ZERO,
@@ -741,7 +778,7 @@ struct mlx5_l3t_tbl *
 				    SOCKET_ID_ANY);
 		if (!g_tbl) {
 			rte_errno = ENOMEM;
-			return -1;
+			goto out;
 		}
 		tbl->tbl = g_tbl;
 	}
@@ -757,7 +794,7 @@ struct mlx5_l3t_tbl *
 				    SOCKET_ID_ANY);
 		if (!m_tbl) {
 			rte_errno = ENOMEM;
-			return -1;
+			goto out;
 		}
 		g_tbl->tbl[(idx >> MLX5_L3T_GT_OFFSET) & MLX5_L3T_GT_MASK] =
 									m_tbl;
@@ -772,7 +809,7 @@ struct mlx5_l3t_tbl *
 		e_tbl = mlx5_ipool_zmalloc(tbl->eip, &tbl_idx);
 		if (!e_tbl) {
 			rte_errno = ENOMEM;
-			return -1;
+			goto out;
 		}
 		((struct mlx5_l3t_entry_word *)e_tbl)->idx = tbl_idx;
 		m_tbl->tbl[(idx >> MLX5_L3T_MT_OFFSET) & MLX5_L3T_MT_MASK] =
@@ -783,24 +820,55 @@ struct mlx5_l3t_tbl *
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
 		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
-		w_e_tbl->entry[entry_idx] = data->word;
+		if (w_e_tbl->entry[entry_idx].data) {
+			data->word = w_e_tbl->entry[entry_idx].data;
+			w_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			goto out;
+		}
+		w_e_tbl->entry[entry_idx].data = data->word;
+		w_e_tbl->entry[entry_idx].ref_cnt = 1;
 		w_e_tbl->ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
 		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
-		dw_e_tbl->entry[entry_idx] = data->dword;
+		if (dw_e_tbl->entry[entry_idx].data) {
+			data->dword = dw_e_tbl->entry[entry_idx].data;
+			dw_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			goto out;
+		}
+		dw_e_tbl->entry[entry_idx].data = data->dword;
+		dw_e_tbl->entry[entry_idx].ref_cnt = 1;
 		dw_e_tbl->ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
 		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
-		qw_e_tbl->entry[entry_idx] = data->qword;
+		if (qw_e_tbl->entry[entry_idx].data) {
+			data->qword = qw_e_tbl->entry[entry_idx].data;
+			qw_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			goto out;
+		}
+		qw_e_tbl->entry[entry_idx].data = data->qword;
+		qw_e_tbl->entry[entry_idx].ref_cnt = 1;
 		qw_e_tbl->ref_cnt++;
 		break;
 	default:
 		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
-		ptr_e_tbl->entry[entry_idx] = data->ptr;
+		if (ptr_e_tbl->entry[entry_idx].data) {
+			data->ptr = ptr_e_tbl->entry[entry_idx].data;
+			ptr_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			goto out;
+		}
+		ptr_e_tbl->entry[entry_idx].data = data->ptr;
+		ptr_e_tbl->entry[entry_idx].ref_cnt = 1;
 		ptr_e_tbl->ref_cnt++;
 		break;
 	}
-	return 0;
+	ret = 0;
+out:
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
 }
diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h
index f078bdc..0da4961 100644
--- a/drivers/net/mlx5/mlx5_utils.h
+++ b/drivers/net/mlx5/mlx5_utils.h
@@ -118,29 +118,41 @@ struct mlx5_l3t_level_tbl {
 struct mlx5_l3t_entry_word {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint16_t entry[]; /* Entry array. */
-};
+	struct {
+		uint16_t data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 double word entry table data structure. */
 struct mlx5_l3t_entry_dword {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint32_t entry[]; /* Entry array. */
-};
+	struct {
+		uint32_t data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 quad word entry table data structure. */
 struct mlx5_l3t_entry_qword {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint64_t entry[]; /* Entry array. */
-};
+	struct {
+		uint64_t data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 pointer entry table data structure. */
 struct mlx5_l3t_entry_ptr {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	void *entry[]; /* Entry array. */
-};
+	struct {
+		void *data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 table data structure. */
 struct mlx5_l3t_tbl {
@@ -148,6 +160,7 @@ struct mlx5_l3t_tbl {
 	struct mlx5_indexed_pool *eip;
 	/* Table index pool handles. */
 	struct mlx5_l3t_level_tbl *tbl; /* Global table index. */
+	rte_spinlock_t sl; /* The table lock. */
 };
 
 /*
@@ -535,32 +548,39 @@ struct mlx5_indexed_pool *
  *   0 if success, -1 on error.
  */
 
-uint32_t mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+int32_t mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 			    union mlx5_l3t_data *data);
 /**
- * This function clears the index entry from Three-level table.
+ * This function decreases the entry reference counter and clears the
+ * index entry from the Three-level table once the counter reaches 0.
  *
  * @param tbl
  *   Pointer to the l3t.
  * @param idx
  *   Index to the entry.
+ *
+ * @return
+ *   The remaining reference count; 0 means the entry is cleared, -1 on error.
  */
-void mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx);
+int32_t mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx);
 
 /**
- * This function gets the index entry from Three-level table.
+ * This function sets the index entry to the Three-level table.
+ * If the entry is already set, the EEXIST errno will be returned and
+ * the existing data will be filled into @p data.
  *
- * @param tbl
+ * @param tbl[in]
  *   Pointer to the l3t.
- * @param idx
+ * @param idx[in]
  *   Index to the entry.
- * @param data
+ * @param data[in/out]
  *   Pointer to the memory which contains the entry data save to l3t.
+ *   If the entry is already set, the existing data is returned here.
  *
  * @return
  *   0 if success, -1 on error.
  */
-uint32_t mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+int32_t mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 			    union mlx5_l3t_data *data);
 
 /*
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH 6/6] net/mlx5: make shared counters thread safe
  2020-10-06 11:38 [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe Suanming Mou
                   ` (4 preceding siblings ...)
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 5/6] net/mlx5: make three level table thread safe Suanming Mou
@ 2020-10-06 11:38 ` Suanming Mou
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
  6 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-06 11:38 UTC (permalink / raw)
  To: viacheslavo, matan; +Cc: rasland, dev

The shared counters save the counter index to the three-level table.
As the three-level table now supports multiple-thread operations, the
shared counters can take advantage of it to become thread safe.

When multiple threads save a counter with the same ID to the same
table entry at the same time, only one will succeed; the others get
the EEXIST errno while the entry reference counter is increased. In
that case, the duplicate counters they created are released.
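
The loser's cleanup in flow_dv_counter_alloc() then reads as follows
(sketch matching the hunk below):

    data.dword = cnt_idx;
    if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data)) {
            if (rte_errno == EEXIST) {
                    /* Another thread registered the ID first:
                     * return our counter to the free list and
                     * use the winner's index instead. */
                    cnt_free->pool = pool;
                    rte_spinlock_lock(&cmng->csl);
                    TAILQ_INSERT_TAIL(&cmng->counters[cnt_type],
                                      cnt_free, next);
                    rte_spinlock_unlock(&cmng->csl);
                    return data.dword;
            }
            goto err;
    }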

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/net/mlx5/mlx5.h            |  1 -
 drivers/net/mlx5/mlx5_flow_dv.c    | 62 +++++++++++---------------------------
 drivers/net/mlx5/mlx5_flow_verbs.c | 19 +++---------
 3 files changed, 21 insertions(+), 61 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8c951e2..6e0b2e2 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -348,7 +348,6 @@ struct flow_counter_stats {
 
 /* Shared counters information for counters. */
 struct mlx5_flow_counter_shared {
-	uint32_t ref_cnt; /**< Reference counter. */
 	uint32_t id; /**< User counter ID. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 31d7fe4..3adb905 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4461,28 +4461,6 @@ struct field_modify_info modify_tcp[] = {
 }
 
 /**
- * Search for existed shared counter.
- *
- * @param[in] dev
- *   Pointer to the Ethernet device structure.
- * @param[in] id
- *   The shared counter ID to search.
- *
- * @return
- *   0 if not existed, otherwise shared counter index.
- */
-static uint32_t
-flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	union mlx5_l3t_data data;
-
-	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data))
-		return 0;
-	return data.dword;
-}
-
-/**
  * Allocate a flow counter.
  *
  * @param[in] dev
@@ -4510,24 +4488,15 @@ struct field_modify_info modify_tcp[] = {
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t cnt_idx;
+	union mlx5_l3t_data data;
 
 	if (!priv->config.devx) {
 		rte_errno = ENOTSUP;
 		return 0;
 	}
-	if (shared) {
-		cnt_idx = flow_dv_counter_shared_search(dev, id);
-		if (cnt_idx) {
-			cnt_free = flow_dv_counter_get_by_idx(dev, cnt_idx,
-							      NULL);
-			if (cnt_free->shared_info.ref_cnt + 1 == 0) {
-				rte_errno = E2BIG;
-				return 0;
-			}
-			cnt_free->shared_info.ref_cnt++;
-			return cnt_idx;
-		}
-	}
+	if (shared && !mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) &&
+	    data.dword)
+		return data.dword;
 	/* Get free counters from container. */
 	rte_spinlock_lock(&cmng->csl);
 	cnt_free = TAILQ_FIRST(&cmng->counters[cnt_type]);
@@ -4566,12 +4535,18 @@ struct field_modify_info modify_tcp[] = {
 				 &cnt_free->bytes))
 		goto err;
 	if (shared) {
-		union mlx5_l3t_data data;
-
 		data.dword = cnt_idx;
-		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
+		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data)) {
+			if (rte_errno == EEXIST) {
+				cnt_free->pool = pool;
+				rte_spinlock_lock(&cmng->csl);
+				TAILQ_INSERT_TAIL(&cmng->counters[cnt_type],
+						  cnt_free, next);
+				rte_spinlock_unlock(&cmng->csl);
+				return data.dword;
+			}
 			goto err;
-		cnt_free->shared_info.ref_cnt = 1;
+		}
 		cnt_free->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
@@ -4667,12 +4642,9 @@ struct field_modify_info modify_tcp[] = {
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (IS_SHARED_CNT(counter)) {
-		if (--cnt->shared_info.ref_cnt)
-			return;
-		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-				     cnt->shared_info.id);
-	}
+	if (IS_SHARED_CNT(counter) &&
+	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
+		return;
 	if (IS_AGE_POOL(pool))
 		flow_dv_counter_remove_from_age(dev, counter, cnt);
 	cnt->pool = pool;
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 69ecc27..f351a68 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -265,15 +265,8 @@
 	int ret;
 
 	if (shared && !mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) &&
-	    data.dword) {
-		cnt = flow_verbs_counter_get_by_idx(dev, data.dword, NULL);
-		if (cnt->shared_info.ref_cnt + 1 == 0) {
-			rte_errno = E2BIG;
-			return 0;
-		}
-		cnt->shared_info.ref_cnt++;
+	    data.dword)
 		return data.dword;
-	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
 		pool = cmng->pools[pool_idx];
 		if (!pool)
@@ -325,7 +318,6 @@
 		data.dword = cnt_idx;
 		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
 			return 0;
-		cnt->shared_info.ref_cnt = 1;
 		cnt->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
@@ -360,12 +352,9 @@
 	struct mlx5_flow_counter_ext *cnt_ext;
 
 	cnt = flow_verbs_counter_get_by_idx(dev, counter, &pool);
-	if (IS_SHARED_CNT(counter)) {
-		if (--cnt->shared_info.ref_cnt)
-			return;
-		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-				     cnt->shared_info.id);
-	}
+	if (IS_SHARED_CNT(counter) &&
+	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
+		return;
 	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 	claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe
  2020-10-06 11:38 [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe Suanming Mou
                   ` (5 preceding siblings ...)
  2020-10-06 11:38 ` [dpdk-dev] [PATCH 6/6] net/mlx5: make shared counters " Suanming Mou
@ 2020-10-20  3:02 ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 1/8] net/mlx5: locate aging pools in the general container Suanming Mou
                     ` (8 more replies)
  6 siblings, 9 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  Cc: dev

The mlx5 PMD is going to support multiple-thread flow operations.
This patchset makes the counter action thread safe.

Suanming Mou (8):
  net/mlx5: locate aging pools in the general container
  net/mlx5: optimize shared counter memory
  net/mlx5: remove single counter container
  net/mlx5: synchronize flow counter pool creation
  net/mlx5: make three level table thread safe
  net/mlx5: make shared counters thread safe
  net/mlx5: rename flow counter macro
  net/mlx5: optimize counter extend memory

--

v2:
 - Optimize the fallback non-batch counter memory.
 - Add MLX5_ prefix to counter macro.
 - Rebase on top of the latest code.

--

 drivers/net/mlx5/linux/mlx5_os.c   |  53 ++-
 drivers/net/mlx5/mlx5.c            |  51 ++-
 drivers/net/mlx5/mlx5.h            | 146 ++++----
 drivers/net/mlx5/mlx5_flow.c       | 183 ++++++----
 drivers/net/mlx5/mlx5_flow.h       |   1 +
 drivers/net/mlx5/mlx5_flow_dv.c    | 706 ++++++++++++++-----------------------
 drivers/net/mlx5/mlx5_flow_verbs.c | 130 ++++---
 drivers/net/mlx5/mlx5_utils.c      | 191 ++++++++--
 drivers/net/mlx5/mlx5_utils.h      |  81 ++++-
 9 files changed, 824 insertions(+), 718 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 1/8] net/mlx5: locate aging pools in the general container
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: optimize shared counter memory Suanming Mou
                     ` (7 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

Commit [1] introduced a different container for the aging counter
pools. In order to save container memory, the aging counter pools
can be located in the general pool container.

This patch locates the aging counter pools in the general pool
container and removes the aging container management.

[1] commit fd143711a6ea ("net/mlx5: separate aging counter pool range")
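
With a single container, aging and plain counters are told apart by a
per-type free list instead of a dedicated container, so the counter
allocation path reduces to (sketch based on the hunks in this series):

    enum mlx5_counter_type cnt_type =
            age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;

    /* Take a free counter of the wanted type from the container. */
    rte_spinlock_lock(&cont->csl);
    cnt_free = TAILQ_FIRST(&cont->counters[cnt_type]);
    if (cnt_free)
            TAILQ_REMOVE(&cont->counters[cnt_type], cnt_free, next);
    rte_spinlock_unlock(&cont->csl);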

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/mlx5.c            |  7 ++--
 drivers/net/mlx5/mlx5.h            | 17 +++++----
 drivers/net/mlx5/mlx5_flow.c       | 19 +++-------
 drivers/net/mlx5/mlx5_flow_dv.c    | 76 ++++++++++++++++++--------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  4 +-
 5 files changed, 56 insertions(+), 67 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 74a537b..a305e37 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -480,7 +480,7 @@ struct mlx5_flow_id_pool *
 static void
 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
 {
-	int i;
+	int i, j;
 
 	memset(&sh->cmng, 0, sizeof(sh->cmng));
 	TAILQ_INIT(&sh->cmng.flow_counters);
@@ -490,7 +490,8 @@ struct mlx5_flow_id_pool *
 		sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
 		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
 		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
-		TAILQ_INIT(&sh->cmng.ccont[i].counters);
+		for (j = 0; j < MLX5_COUNTER_TYPE_MAX; j++)
+			TAILQ_INIT(&sh->cmng.ccont[i].counters[j]);
 		rte_spinlock_init(&sh->cmng.ccont[i].csl);
 	}
 }
@@ -535,7 +536,7 @@ struct mlx5_flow_id_pool *
 	}
 	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
 		struct mlx5_flow_counter_pool *pool;
-		uint32_t batch = !!(i > 1);
+		uint32_t batch = (i == MLX5_CCONT_TYPE_BATCH);
 
 		if (!sh->cmng.ccont[i].pools)
 			continue;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index afa2f31..26c603b 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -272,7 +272,6 @@ struct mlx5_drop {
 #define MLX5_COUNTERS_PER_POOL 512
 #define MLX5_MAX_PENDING_QUERIES 4
 #define MLX5_CNT_CONTAINER_RESIZE 64
-#define MLX5_CNT_AGE_OFFSET 0x80000000
 #define CNT_SIZE (sizeof(struct mlx5_flow_counter))
 #define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
 #define AGE_SIZE (sizeof(struct mlx5_age_param))
@@ -280,7 +279,6 @@ struct mlx5_drop {
 #define CNT_POOL_TYPE_AGE	(1 << 1)
 #define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
 #define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
-#define MLX_CNT_IS_AGE(counter) ((counter) & MLX5_CNT_AGE_OFFSET ? 1 : 0)
 #define MLX5_CNT_LEN(pool) \
 	(CNT_SIZE + \
 	(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
@@ -321,17 +319,20 @@ enum {
 	AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
 };
 
-#define MLX5_CNT_CONTAINER(sh, batch, age) (&(sh)->cmng.ccont \
-					    [(batch) * 2 + (age)])
+#define MLX5_CNT_CONTAINER(sh, batch) (&(sh)->cmng.ccont[batch])
 
 enum {
 	MLX5_CCONT_TYPE_SINGLE,
-	MLX5_CCONT_TYPE_SINGLE_FOR_AGE,
 	MLX5_CCONT_TYPE_BATCH,
-	MLX5_CCONT_TYPE_BATCH_FOR_AGE,
 	MLX5_CCONT_TYPE_MAX,
 };
 
+enum mlx5_counter_type {
+	MLX5_COUNTER_TYPE_ORIGIN,
+	MLX5_COUNTER_TYPE_AGE,
+	MLX5_COUNTER_TYPE_MAX,
+};
+
 /* Counter age parameter. */
 struct mlx5_age_param {
 	uint16_t state; /**< Age state (atomically accessed). */
@@ -426,7 +427,8 @@ struct mlx5_pools_container {
 	int max_id; /* The maximum counter ID in the pools. */
 	rte_spinlock_t resize_sl; /* The resize lock. */
 	rte_spinlock_t csl; /* The counter free list lock. */
-	struct mlx5_counters counters; /* Free counter list. */
+	struct mlx5_counters counters[MLX5_COUNTER_TYPE_MAX];
+	/* Free counter list. */
 	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
@@ -440,7 +442,6 @@ struct mlx5_flow_counter_mng {
 	uint8_t pending_queries;
 	uint8_t batch;
 	uint16_t pool_index;
-	uint8_t age;
 	uint8_t query_thread_on;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
 	LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c56dac8..598422c 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6649,7 +6649,6 @@ struct mlx5_meter_domains_infos *
 	uint16_t offset;
 	int ret;
 	uint8_t batch = sh->cmng.batch;
-	uint8_t age = sh->cmng.age;
 	uint16_t pool_index = sh->cmng.pool_index;
 	struct mlx5_pools_container *cont;
 	struct mlx5_flow_counter_pool *pool;
@@ -6658,7 +6657,7 @@ struct mlx5_meter_domains_infos *
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
 next_container:
-	cont = MLX5_CNT_CONTAINER(sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(sh, batch);
 	rte_spinlock_lock(&cont->resize_sl);
 	if (!cont->pools) {
 		rte_spinlock_unlock(&cont->resize_sl);
@@ -6667,11 +6666,6 @@ struct mlx5_meter_domains_infos *
 			goto set_alarm;
 		batch ^= 0x1;
 		pool_index = 0;
-		if (batch == 0 && pool_index == 0) {
-			age ^= 0x1;
-			sh->cmng.batch = batch;
-			sh->cmng.age = age;
-		}
 		goto next_container;
 	}
 	pool = cont->pools[pool_index];
@@ -6720,13 +6714,10 @@ struct mlx5_meter_domains_infos *
 	if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
 		batch ^= 0x1;
 		pool_index = 0;
-		if (batch == 0 && pool_index == 0)
-			age ^= 0x1;
 	}
 set_alarm:
 	sh->cmng.batch = batch;
 	sh->cmng.pool_index = pool_index;
-	sh->cmng.age = age;
 	mlx5_set_query_alarm(sh);
 }
 
@@ -6817,10 +6808,12 @@ struct mlx5_meter_domains_infos *
 	struct mlx5_flow_counter_pool *pool =
 		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
 	struct mlx5_counter_stats_raw *raw_to_free;
-	uint8_t age = !!IS_AGE_POOL(pool);
 	uint8_t query_gen = pool->query_gen ^ 1;
 	struct mlx5_pools_container *cont =
-		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool), age);
+		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool));
+	enum mlx5_counter_type cnt_type =
+		IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
+				    MLX5_COUNTER_TYPE_ORIGIN;
 
 	if (unlikely(status)) {
 		raw_to_free = pool->raw_hw;
@@ -6835,7 +6828,7 @@ struct mlx5_meter_domains_infos *
 		rte_io_wmb();
 		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
 			rte_spinlock_lock(&cont->csl);
-			TAILQ_CONCAT(&cont->counters,
+			TAILQ_CONCAT(&cont->counters[cnt_type],
 				     &pool->counters[query_gen], next);
 			rte_spinlock_unlock(&cont->csl);
 		}
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index d3a3f23..90b98cc 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4605,16 +4605,14 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_pools_container *cont;
 	struct mlx5_flow_counter_pool *pool;
-	uint32_t batch = 0, age = 0;
+	uint32_t batch = 0;
 
 	idx--;
-	age = MLX_CNT_IS_AGE(idx);
-	idx = age ? idx - MLX5_CNT_AGE_OFFSET : idx;
 	if (idx >= MLX5_CNT_BATCH_OFFSET) {
 		idx -= MLX5_CNT_BATCH_OFFSET;
 		batch = 1;
 	}
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n);
 	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
@@ -4767,19 +4765,15 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[in] batch
  *   Whether the pool is for counter that was allocated by batch command.
- * @param[in] age
- *   Whether the pool is for Aging counter.
  *
  * @return
  *   0 on success, otherwise negative errno value and rte_errno is set.
  */
 static int
-flow_dv_container_resize(struct rte_eth_dev *dev,
-				uint32_t batch, uint32_t age)
+flow_dv_container_resize(struct rte_eth_dev *dev, uint32_t batch)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
 	void *old_pools = cont->pools;
 	uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
@@ -4897,12 +4891,11 @@ struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	int16_t n_valid = rte_atomic16_read(&cont->n_valid);
 	uint32_t size = sizeof(*pool);
 
-	if (cont->n == n_valid && flow_dv_container_resize(dev, batch, age))
+	if (cont->n == n_valid && flow_dv_container_resize(dev, batch))
 		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
 	size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
@@ -5031,10 +5024,12 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_devx_obj *last_min_dcs;
 	struct mlx5_devx_obj *dcs = NULL;
 	struct mlx5_flow_counter *cnt;
+	enum mlx5_counter_type cnt_type =
+			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t add2other;
 	uint32_t i;
 
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch, age);
+	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
 	if (!batch) {
 retry:
 		add2other = 0;
@@ -5043,25 +5038,20 @@ struct field_modify_info modify_tcp[] = {
 		if (!dcs)
 			return NULL;
 		pool = flow_dv_find_pool_by_id(cont, dcs->id);
-		/* Check if counter belongs to exist pool ID range. */
+		/*
+		 * If the pool exists but with another type, the counter
+		 * will be added to the other pool and a new counter in
+		 * the range with the same type must be reallocated later.
+		 */
 		if (!pool) {
-			pool = flow_dv_find_pool_by_id
-			       (MLX5_CNT_CONTAINER
-			       (priv->sh, batch, (age ^ 0x1)), dcs->id);
-			/*
-			 * Pool exists, counter will be added to the other
-			 * container, need to reallocate it later.
-			 */
-			if (pool) {
-				add2other = 1;
-			} else {
-				pool = flow_dv_pool_create(dev, dcs, batch,
-							   age);
-				if (!pool) {
-					mlx5_devx_cmd_destroy(dcs);
-					return NULL;
-				}
+			pool = flow_dv_pool_create(dev, dcs, batch,
+						   age);
+			if (!pool) {
+				mlx5_devx_cmd_destroy(dcs);
+				return NULL;
 			}
+		} else if ((!!IS_AGE_POOL(pool)) != age) {
+			add2other = 1;
 		}
 		if ((dcs->id < pool->min_dcs->id ||
 		    pool->min_dcs->id &
@@ -5128,7 +5118,7 @@ struct field_modify_info modify_tcp[] = {
 		TAILQ_INSERT_HEAD(&tmp_tq, cnt, next);
 	}
 	rte_spinlock_lock(&cont->csl);
-	TAILQ_CONCAT(&cont->counters, &tmp_tq, next);
+	TAILQ_CONCAT(&cont->counters[cnt_type], &tmp_tq, next);
 	rte_spinlock_unlock(&cont->csl);
 	*cnt_free = MLX5_POOL_GET_CNT(pool, 0);
 	(*cnt_free)->pool = pool;
@@ -5201,8 +5191,9 @@ struct field_modify_info modify_tcp[] = {
 	 * shared counters from the single container.
 	 */
 	uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch,
-							       age);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	enum mlx5_counter_type cnt_type =
+			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t cnt_idx;
 
 	if (!priv->config.devx) {
@@ -5225,9 +5216,9 @@ struct field_modify_info modify_tcp[] = {
 	}
 	/* Get free counters from container. */
 	rte_spinlock_lock(&cont->csl);
-	cnt_free = TAILQ_FIRST(&cont->counters);
+	cnt_free = TAILQ_FIRST(&cont->counters[cnt_type]);
 	if (cnt_free)
-		TAILQ_REMOVE(&cont->counters, cnt_free, next);
+		TAILQ_REMOVE(&cont->counters[cnt_type], cnt_free, next);
 	rte_spinlock_unlock(&cont->csl);
 	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free,
 						       batch, age))
@@ -5258,7 +5249,6 @@ struct field_modify_info modify_tcp[] = {
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool->index,
 				MLX5_CNT_ARRAY_IDX(pool, cnt_free));
 	cnt_idx += batch * MLX5_CNT_BATCH_OFFSET;
-	cnt_idx += age * MLX5_CNT_AGE_OFFSET;
 	/* Update the counter reset values. */
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
@@ -5283,7 +5273,7 @@ struct field_modify_info modify_tcp[] = {
 	if (cnt_free) {
 		cnt_free->pool = pool;
 		rte_spinlock_lock(&cont->csl);
-		TAILQ_INSERT_TAIL(&cont->counters, cnt_free, next);
+		TAILQ_INSERT_TAIL(&cont->counters[cnt_type], cnt_free, next);
 		rte_spinlock_unlock(&cont->csl);
 	}
 	return 0;
@@ -5363,6 +5353,7 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
+	enum mlx5_counter_type cnt_type;
 
 	if (!counter)
 		return;
@@ -5391,12 +5382,15 @@ struct field_modify_info modify_tcp[] = {
 	 * function both operate with the different list.
 	 *
 	 */
-	if (!priv->counter_fallback)
+	if (!priv->counter_fallback) {
 		TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen], cnt, next);
-	else
+	} else {
+		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
+					       MLX5_COUNTER_TYPE_ORIGIN;
 		TAILQ_INSERT_TAIL(&((MLX5_CNT_CONTAINER
-				  (priv->sh, 0, 0))->counters),
+				  (priv->sh, 0))->counters[cnt_type]),
 				  cnt, next);
+	}
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 276bcb5..698fb2b 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -159,7 +159,7 @@
 			      struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool;
 
 	idx--;
@@ -254,7 +254,7 @@
 flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0, 0);
+	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 2/8] net/mlx5: optimize shared counter memory
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 1/8] net/mlx5: locate aging pools in the general container Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: remove single counter container Suanming Mou
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

Instead of dedicating special memory to mark a counter as shared, this
patch reuses the reserved memory in the counter handler to indicate it.
A counter index carrying the MLX5_CNT_SHARED_OFFSET bit denotes a
shared counter.

This patch also prepares for a follow-up adjustment that uses batch
counters as shared counters.
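
Below is a minimal, standalone sketch of the index encoding this relies
on; the macro name and value match the patch, while the helper names
and the main() driver are illustrative only:

#include <assert.h>
#include <stdint.h>

#define MLX5_CNT_SHARED_OFFSET 0x80000000u /* High bit marks shared. */

/* Tag a freshly allocated (1-based) counter index as shared. */
static inline uint32_t
cnt_idx_mark_shared(uint32_t cnt_idx)
{
	return cnt_idx | MLX5_CNT_SHARED_OFFSET;
}

/* Recover the 0-based pool index: drop the shared bit and the +1 bias. */
static inline uint32_t
cnt_idx_decode(uint32_t cnt_idx)
{
	return (cnt_idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
}

int
main(void)
{
	uint32_t idx = cnt_idx_mark_shared(42);

	assert(idx & MLX5_CNT_SHARED_OFFSET); /* Shared bit is set. */
	assert(cnt_idx_decode(idx) == 41);    /* Base index survives. */
	return 0;
}

With this encoding the release path only needs the index itself to tell
a shared counter apart, so no extra per-counter memory is spent on it.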

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/mlx5.h            | 36 +++++++++++++-----
 drivers/net/mlx5/mlx5_flow_dv.c    | 78 +++++++++++++++-----------------------
 drivers/net/mlx5/mlx5_flow_verbs.c | 60 +++++++++++++++++------------
 3 files changed, 93 insertions(+), 81 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 26c603b..e3ac07f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -272,6 +272,10 @@ struct mlx5_drop {
 #define MLX5_COUNTERS_PER_POOL 512
 #define MLX5_MAX_PENDING_QUERIES 4
 #define MLX5_CNT_CONTAINER_RESIZE 64
+#define MLX5_CNT_SHARED_OFFSET 0x80000000
+#define IS_SHARED_CNT(cnt) (!!((cnt) & MLX5_CNT_SHARED_OFFSET))
+#define IS_BATCH_CNT(cnt) (((cnt) & (MLX5_CNT_SHARED_OFFSET - 1)) >= \
+			   MLX5_CNT_BATCH_OFFSET)
 #define CNT_SIZE (sizeof(struct mlx5_flow_counter))
 #define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
 #define AGE_SIZE (sizeof(struct mlx5_age_param))
@@ -348,10 +352,29 @@ struct flow_counter_stats {
 	uint64_t bytes;
 };
 
+/* Shared counter information. */
+struct mlx5_flow_counter_shared {
+	uint32_t ref_cnt; /**< Reference counter. */
+	uint32_t id; /**< User counter ID. */
+};
+
+struct mlx5_flow_counter_pool;
 /* Generic counters information. */
 struct mlx5_flow_counter {
-	TAILQ_ENTRY(mlx5_flow_counter) next;
-	/**< Pointer to the next flow counter structure. */
+	union {
+		/*
+		 * The shared info is only used while a user-defined
+		 * counter is active. Aging counter sharing is not
+		 * supported, so an active shared counter will not be
+		 * chained to the aging list. For a shared counter, the
+		 * TAILQ entry memory is only used once it is released;
+		 * by then the shared info is no longer needed.
+		 */
+		TAILQ_ENTRY(mlx5_flow_counter) next;
+		/**< Pointer to the next flow counter structure. */
+		struct mlx5_flow_counter_shared shared_info;
+		/**< Shared counter information. */
+	};
 	union {
 		uint64_t hits; /**< Reset value of hits packets. */
 		struct mlx5_flow_counter_pool *pool; /**< Counter pool. */
@@ -360,15 +383,10 @@ struct mlx5_flow_counter {
 	void *action; /**< Pointer to the dv action. */
 };
 
-/* Extend counters information for none batch counters. */
+/* Extended counter information for non-batch fallback counters. */
 struct mlx5_flow_counter_ext {
-	uint32_t shared:1; /**< Share counter ID with other flow rules. */
-	uint32_t batch: 1;
 	uint32_t skipped:1; /* This counter is skipped or not. */
-	/**< Whether the counter was allocated by batch command. */
-	uint32_t ref_cnt:29; /**< Reference counter. */
-	uint32_t id; /**< User counter ID. */
-	union {  /**< Holds the counters for the rule. */
+	union {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 		struct ibv_counter_set *cs;
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 90b98cc..b16db1d 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4607,8 +4607,9 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_flow_counter_pool *pool;
 	uint32_t batch = 0;
 
-	idx--;
-	if (idx >= MLX5_CNT_BATCH_OFFSET) {
+	/* Decrease to original index and clear shared bit. */
+	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
+	if (IS_BATCH_CNT(idx)) {
 		idx -= MLX5_CNT_BATCH_OFFSET;
 		batch = 1;
 	}
@@ -4843,7 +4844,7 @@ struct field_modify_info modify_tcp[] = {
 
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (counter < MLX5_CNT_BATCH_OFFSET) {
+	if (!IS_BATCH_CNT(counter)) {
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
 		if (priv->counter_fallback)
 			return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
@@ -5132,29 +5133,19 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[in] id
  *   The shared counter ID to search.
- * @param[out] ppool
- *   mlx5 flow counter pool in the container,
  *
  * @return
- *   NULL if not existed, otherwise pointer to the shared extend counter.
+ *   0 if not existed, otherwise shared counter index.
  */
-static struct mlx5_flow_counter_ext *
-flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id,
-			      struct mlx5_flow_counter_pool **ppool)
+static uint32_t
+flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	union mlx5_l3t_data data;
-	uint32_t cnt_idx;
 
-	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) || !data.dword)
-		return NULL;
-	cnt_idx = data.dword;
-	/*
-	 * Shared counters don't have age info. The counter extend is after
-	 * the counter datat structure.
-	 */
-	return (struct mlx5_flow_counter_ext *)
-	       ((flow_dv_counter_get_by_idx(dev, cnt_idx, ppool)) + 1);
+	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data))
+		return 0;
+	return data.dword;
 }
 
 /**
@@ -5201,16 +5192,15 @@ struct field_modify_info modify_tcp[] = {
 		return 0;
 	}
 	if (shared) {
-		cnt_ext = flow_dv_counter_shared_search(dev, id, &pool);
-		if (cnt_ext) {
-			if (cnt_ext->ref_cnt + 1 == 0) {
+		cnt_idx = flow_dv_counter_shared_search(dev, id);
+		if (cnt_idx) {
+			cnt_free = flow_dv_counter_get_by_idx(dev, cnt_idx,
+							      NULL);
+			if (cnt_free->shared_info.ref_cnt + 1 == 0) {
 				rte_errno = E2BIG;
 				return 0;
 			}
-			cnt_ext->ref_cnt++;
-			cnt_idx = pool->index * MLX5_COUNTERS_PER_POOL +
-				  (cnt_ext->dcs->id % MLX5_COUNTERS_PER_POOL)
-				  + 1;
+			cnt_free->shared_info.ref_cnt++;
 			return cnt_idx;
 		}
 	}
@@ -5253,17 +5243,15 @@ struct field_modify_info modify_tcp[] = {
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
 		goto err;
-	if (cnt_ext) {
-		cnt_ext->shared = shared;
-		cnt_ext->ref_cnt = 1;
-		cnt_ext->id = id;
-		if (shared) {
-			union mlx5_l3t_data data;
-
-			data.dword = cnt_idx;
-			if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
-				return 0;
-		}
+	if (shared) {
+		union mlx5_l3t_data data;
+
+		data.dword = cnt_idx;
+		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
+			goto err;
+		cnt_free->shared_info.ref_cnt = 1;
+		cnt_free->shared_info.id = id;
+		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
 	if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on)
 		/* Start the asynchronous batch query by the host thread. */
@@ -5352,22 +5340,18 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt;
-	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	enum mlx5_counter_type cnt_type;
 
 	if (!counter)
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (counter < MLX5_CNT_BATCH_OFFSET) {
-		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-		if (cnt_ext) {
-			if (--cnt_ext->ref_cnt)
-				return;
-			if (cnt_ext->shared)
-				mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-						     cnt_ext->id);
-		}
+
+	if (IS_SHARED_CNT(counter)) {
+		if (--cnt->shared_info.ref_cnt)
+			return;
+		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
+				     cnt->shared_info.id);
 	}
 	if (IS_AGE_POOL(pool))
 		flow_dv_counter_remove_from_age(dev, counter, cnt);
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 698fb2b..bda55c2 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -162,7 +162,7 @@
 	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
 	struct mlx5_flow_counter_pool *pool;
 
-	idx--;
+	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
 	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
@@ -258,22 +258,21 @@
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
+	union mlx5_l3t_data data;
 	uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
-	uint32_t pool_idx;
+	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
 
-	if (shared) {
-		for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
-			pool = cont->pools[pool_idx];
-			for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
-				cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-				if (cnt_ext->shared && cnt_ext->id == id) {
-					cnt_ext->ref_cnt++;
-					return MLX5_MAKE_CNT_IDX(pool_idx, i);
-				}
-			}
+	if (shared && !mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) &&
+	    data.dword) {
+		cnt = flow_verbs_counter_get_by_idx(dev, data.dword, NULL);
+		if (cnt->shared_info.ref_cnt + 1 == 0) {
+			rte_errno = E2BIG;
+			return 0;
 		}
+		cnt->shared_info.ref_cnt++;
+		return data.dword;
 	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
 		pool = cont->pools[pool_idx];
@@ -322,17 +321,23 @@
 		TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
+	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
+	if (shared) {
+		data.dword = cnt_idx;
+		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
+			return 0;
+		cnt->shared_info.ref_cnt = 1;
+		cnt->shared_info.id = id;
+		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
+	}
 	cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-	cnt_ext->id = id;
-	cnt_ext->shared = shared;
-	cnt_ext->ref_cnt = 1;
 	cnt->hits = 0;
 	cnt->bytes = 0;
 	/* Create counter with Verbs. */
 	ret = flow_verbs_counter_create(dev, cnt_ext);
 	if (!ret) {
 		TAILQ_REMOVE(&pool->counters[0], cnt, next);
-		return MLX5_MAKE_CNT_IDX(pool_idx, i);
+		return cnt_idx;
 	}
 	/* Some error occurred in Verbs library. */
 	rte_errno = -ret;
@@ -350,23 +355,28 @@
 static void
 flow_verbs_counter_release(struct rte_eth_dev *dev, uint32_t counter)
 {
+	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter *cnt;
 	struct mlx5_flow_counter_ext *cnt_ext;
 
-	cnt = flow_verbs_counter_get_by_idx(dev, counter,
-					    &pool);
+	cnt = flow_verbs_counter_get_by_idx(dev, counter, &pool);
+	if (IS_SHARED_CNT(counter)) {
+		if (--cnt->shared_info.ref_cnt)
+			return;
+		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
+				     cnt->shared_info.id);
+	}
 	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-	if (--cnt_ext->ref_cnt == 0) {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-		claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
-		cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
+	cnt_ext->cs = NULL;
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-		claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
-		cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
+	cnt_ext->cs = NULL;
 #endif
-		TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
-	}
+	(void)cnt_ext;
+	TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 3/8] net/mlx5: remove single counter container
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 1/8] net/mlx5: locate aging pools in the general container Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: optimize shared counter memory Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: synchronize flow counter pool creation Suanming Mou
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

A flow counter allocated by the batch API could not be assigned to a
flow in the root table (group 0) with old rdma-core versions. Hence,
root table flow counters required a PMD mechanism to manage counters
allocated one by one.

Currently, batch counters are supported in the root table when the
rdma-core version includes the MLX5_FLOW_ACTION_COUNTER_OFFSET enum
and the kernel driver includes the
MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET enum.

When the PMD uses the rdma-core API to assign a batch counter with an
invalid counter offset to a root table flow, it gets an error only if
batch counter assignment for the root table is supported. Performing
this trial at initialization time detects the support.

Based on this trial, if the support is present, remove the management
of the single counter container from the fast counter mechanism.
Otherwise, move the counter mechanism to fallback mode.
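
A condensed sketch of the resulting mode selection follows; the
structure, the stub and the counter_fallback_needed() helper are
stand-ins for the PMD's HCA attributes and for
mlx5_flow_dv_discover_counter_offset_support(), while the 0x4 bulk
bit and the -ENOTSUP check come from the patch:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct hca_attr_sketch {
	bool flow_counters_dump;        /* HW supports counter dump. */
	unsigned int bulk_alloc_bitmap; /* Bulk sizes, 128-counter units. */
};

/* Stub for the root-table trial; 0 means offset support detected. */
static int
discover_counter_offset_support(void)
{
	return 0;
}

/* Fall back to one-by-one counter management unless every batch
 * prerequisite, including the root-table trial, holds. */
static bool
counter_fallback_needed(const struct hca_attr_sketch *attr)
{
	if (!attr->flow_counters_dump)
		return true;
	if (!(attr->bulk_alloc_bitmap & 0x4)) /* 4 * 128 = one pool. */
		return true;
	return discover_counter_offset_support() == -ENOTSUP;
}

int
main(void)
{
	struct hca_attr_sketch attr = {
		.flow_counters_dump = true,
		.bulk_alloc_bitmap = 0x4,
	};

	printf("fallback: %d\n", counter_fallback_needed(&attr));
	return 0;
}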

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c   |  42 +++-
 drivers/net/mlx5/mlx5.c            |  38 ++--
 drivers/net/mlx5/mlx5.h            |  28 +--
 drivers/net/mlx5/mlx5_flow.c       |  76 ++-----
 drivers/net/mlx5/mlx5_flow.h       |   1 +
 drivers/net/mlx5/mlx5_flow_dv.c    | 397 ++++++++++++++++---------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  26 +--
 7 files changed, 258 insertions(+), 350 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index d95082f..fbd95e7 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -562,6 +562,39 @@
 }
 
 /**
+ * DV flow counter mode detection and configuration.
+ *
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	/* If devx is not supported or not in DV mode, counters do not work. */
+	if (!priv->config.devx || !priv->config.dv_flow_en)
+		return;
+#ifndef HAVE_IBV_DEVX_ASYNC
+	priv->counter_fallback = 1;
+#else
+	priv->counter_fallback = 0;
+	if (!priv->config.hca_attr.flow_counters_dump ||
+	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
+	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
+		priv->counter_fallback = 1;
+#endif
+	if (priv->counter_fallback)
+		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
+			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
+			priv->config.hca_attr.flow_counters_dump,
+			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
+#endif
+}
+
+/**
  * Spawn an Ethernet device from Verbs information.
  *
  * @param dpdk_dev
@@ -1029,19 +1062,11 @@
 		DRV_LOG(INFO, "Rx CQE padding is enabled");
 	}
 	if (config->devx) {
-		priv->counter_fallback = 0;
 		err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
 		if (err) {
 			err = -err;
 			goto error;
 		}
-		if (!config->hca_attr.flow_counters_dump)
-			priv->counter_fallback = 1;
-#ifndef HAVE_IBV_DEVX_ASYNC
-		priv->counter_fallback = 1;
-#endif
-		if (priv->counter_fallback)
-			DRV_LOG(INFO, "Use fall-back DV counter management");
 		/* Check for LRO support. */
 		if (config->dest_tir && config->hca_attr.lro_cap &&
 		    config->dv_flow_en) {
@@ -1443,6 +1468,7 @@
 			goto error;
 		}
 	}
+	mlx5_flow_counter_mode_config(eth_dev);
 	return eth_dev;
 error:
 	if (priv) {
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index a305e37..4d1ca9a 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -480,19 +480,18 @@ struct mlx5_flow_id_pool *
 static void
 mlx5_flow_counters_mng_init(struct mlx5_dev_ctx_shared *sh)
 {
-	int i, j;
+	int i;
 
 	memset(&sh->cmng, 0, sizeof(sh->cmng));
 	TAILQ_INIT(&sh->cmng.flow_counters);
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
-		sh->cmng.ccont[i].min_id = MLX5_CNT_BATCH_OFFSET;
-		sh->cmng.ccont[i].max_id = -1;
-		sh->cmng.ccont[i].last_pool_idx = POOL_IDX_INVALID;
-		TAILQ_INIT(&sh->cmng.ccont[i].pool_list);
-		rte_spinlock_init(&sh->cmng.ccont[i].resize_sl);
-		for (j = 0; j < MLX5_COUNTER_TYPE_MAX; j++)
-			TAILQ_INIT(&sh->cmng.ccont[i].counters[j]);
-		rte_spinlock_init(&sh->cmng.ccont[i].csl);
+	sh->cmng.min_id = MLX5_CNT_BATCH_OFFSET;
+	sh->cmng.max_id = -1;
+	sh->cmng.last_pool_idx = POOL_IDX_INVALID;
+	TAILQ_INIT(&sh->cmng.pool_list);
+	rte_spinlock_init(&sh->cmng.resize_sl);
+	for (i = 0; i < MLX5_COUNTER_TYPE_MAX; i++) {
+		TAILQ_INIT(&sh->cmng.counters[i]);
+		rte_spinlock_init(&sh->cmng.csl[i]);
 	}
 }
 
@@ -523,7 +522,6 @@ struct mlx5_flow_id_pool *
 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
 {
 	struct mlx5_counter_stats_mem_mng *mng;
-	int i;
 	int j;
 	int retries = 1024;
 
@@ -534,15 +532,13 @@ struct mlx5_flow_id_pool *
 			break;
 		rte_pause();
 	}
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i) {
+
+	if (sh->cmng.pools) {
 		struct mlx5_flow_counter_pool *pool;
-		uint32_t batch = (i == MLX5_CCONT_TYPE_BATCH);
 
-		if (!sh->cmng.ccont[i].pools)
-			continue;
-		pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+		pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		while (pool) {
-			if (batch && pool->min_dcs)
+			if (!IS_EXT_POOL(pool) && pool->min_dcs)
 				claim_zero(mlx5_devx_cmd_destroy
 							       (pool->min_dcs));
 			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
@@ -551,17 +547,17 @@ struct mlx5_flow_id_pool *
 					 (mlx5_glue->destroy_flow_action
 					  (MLX5_POOL_GET_CNT
 					  (pool, j)->action));
-				if (!batch && MLX5_GET_POOL_CNT_EXT
+				if (IS_EXT_POOL(pool) && MLX5_GET_POOL_CNT_EXT
 				    (pool, j)->dcs)
 					claim_zero(mlx5_devx_cmd_destroy
 						   (MLX5_GET_POOL_CNT_EXT
 						    (pool, j)->dcs));
 			}
-			TAILQ_REMOVE(&sh->cmng.ccont[i].pool_list, pool, next);
+			TAILQ_REMOVE(&sh->cmng.pool_list, pool, next);
 			mlx5_free(pool);
-			pool = TAILQ_FIRST(&sh->cmng.ccont[i].pool_list);
+			pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		}
-		mlx5_free(sh->cmng.ccont[i].pools);
+		mlx5_free(sh->cmng.pools);
 	}
 	mng = LIST_FIRST(&sh->cmng.mem_mngs);
 	while (mng) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e3ac07f..78cdac3 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -281,8 +281,10 @@ struct mlx5_drop {
 #define AGE_SIZE (sizeof(struct mlx5_age_param))
 #define CNT_POOL_TYPE_EXT	(1 << 0)
 #define CNT_POOL_TYPE_AGE	(1 << 1)
+
 #define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
 #define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
+
 #define MLX5_CNT_LEN(pool) \
 	(CNT_SIZE + \
 	(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
@@ -323,14 +325,6 @@ enum {
 	AGE_TMOUT, /* Timeout, wait for rte_flow_get_aged_flows and destroy. */
 };
 
-#define MLX5_CNT_CONTAINER(sh, batch) (&(sh)->cmng.ccont[batch])
-
-enum {
-	MLX5_CCONT_TYPE_SINGLE,
-	MLX5_CCONT_TYPE_BATCH,
-	MLX5_CCONT_TYPE_MAX,
-};
-
 enum mlx5_counter_type {
 	MLX5_COUNTER_TYPE_ORIGIN,
 	MLX5_COUNTER_TYPE_AGE,
@@ -385,7 +379,6 @@ struct mlx5_flow_counter {
 
 /* Extended counter information for non-batch fallback counters. */
 struct mlx5_flow_counter_ext {
-	uint32_t skipped:1; /* This counter is skipped or not. */
 	union {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 		struct ibv_counter_set *cs;
@@ -409,9 +402,8 @@ struct mlx5_flow_counter_pool {
 	/* The devx object of the minimum counter ID. */
 	uint64_t time_of_last_age_check;
 	/* System time (from rte_rdtsc()) read in the last aging check. */
-	uint32_t index:28; /* Pool index in container. */
+	uint32_t index:29; /* Pool index in container. */
 	uint32_t type:2; /* Memory type behind the counter array. */
-	uint32_t skip_cnt:1; /* Pool contains skipped counter. */
 	volatile uint32_t query_gen:1; /* Query round. */
 	rte_spinlock_t sl; /* The pool lock. */
 	struct mlx5_counter_stats_raw *raw;
@@ -429,36 +421,30 @@ struct mlx5_counter_stats_mem_mng {
 /* Raw memory structure for the counter statistics values of a pool. */
 struct mlx5_counter_stats_raw {
 	LIST_ENTRY(mlx5_counter_stats_raw) next;
-	int min_dcs_id;
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	volatile struct flow_counter_stats *data;
 };
 
 TAILQ_HEAD(mlx5_counter_pools, mlx5_flow_counter_pool);
 
-/* Container structure for counter pools. */
-struct mlx5_pools_container {
+/* Counter global management structure. */
+struct mlx5_flow_counter_mng {
 	rte_atomic16_t n_valid; /* Number of valid pools. */
 	uint16_t n; /* Number of pools. */
 	uint16_t last_pool_idx; /* Last used pool index */
 	int min_id; /* The minimum counter ID in the pools. */
 	int max_id; /* The maximum counter ID in the pools. */
 	rte_spinlock_t resize_sl; /* The resize lock. */
-	rte_spinlock_t csl; /* The counter free list lock. */
+	rte_spinlock_t csl[MLX5_COUNTER_TYPE_MAX];
+	/* The counter free list lock. */
 	struct mlx5_counters counters[MLX5_COUNTER_TYPE_MAX];
 	/* Free counter list. */
 	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	/* Hold the memory management for the next allocated pools raws. */
-};
-
-/* Counter global management structure. */
-struct mlx5_flow_counter_mng {
-	struct mlx5_pools_container ccont[MLX5_CCONT_TYPE_MAX];
 	struct mlx5_counters flow_counters; /* Legacy flow counter list. */
 	uint8_t pending_queries;
-	uint8_t batch;
 	uint16_t pool_index;
 	uint8_t query_thread_on;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 598422c..dae7ac3 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6592,26 +6592,6 @@ struct mlx5_meter_domains_infos *
 #define MLX5_POOL_QUERY_FREQ_US 1000000
 
 /**
- * Get number of all validate pools.
- *
- * @param[in] sh
- *   Pointer to mlx5_dev_ctx_shared object.
- *
- * @return
- *   The number of all validate pools.
- */
-static uint32_t
-mlx5_get_all_valid_pool_count(struct mlx5_dev_ctx_shared *sh)
-{
-	int i;
-	uint32_t pools_n = 0;
-
-	for (i = 0; i < MLX5_CCONT_TYPE_MAX; ++i)
-		pools_n += rte_atomic16_read(&sh->cmng.ccont[i].n_valid);
-	return pools_n;
-}
-
-/**
  * Set the periodic procedure for triggering asynchronous batch queries for all
  * the counter pools.
  *
@@ -6623,7 +6603,7 @@ struct mlx5_meter_domains_infos *
 {
 	uint32_t pools_n, us;
 
-	pools_n = mlx5_get_all_valid_pool_count(sh);
+	pools_n = rte_atomic16_read(&sh->cmng.n_valid);
 	us = MLX5_POOL_QUERY_FREQ_US / pools_n;
 	DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us);
 	if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
@@ -6645,31 +6625,16 @@ struct mlx5_meter_domains_infos *
 mlx5_flow_query_alarm(void *arg)
 {
 	struct mlx5_dev_ctx_shared *sh = arg;
-	struct mlx5_devx_obj *dcs;
-	uint16_t offset;
 	int ret;
-	uint8_t batch = sh->cmng.batch;
 	uint16_t pool_index = sh->cmng.pool_index;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
-	int cont_loop = MLX5_CCONT_TYPE_MAX;
 
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
-next_container:
-	cont = MLX5_CNT_CONTAINER(sh, batch);
-	rte_spinlock_lock(&cont->resize_sl);
-	if (!cont->pools) {
-		rte_spinlock_unlock(&cont->resize_sl);
-		/* Check if all the containers are empty. */
-		if (unlikely(--cont_loop == 0))
-			goto set_alarm;
-		batch ^= 0x1;
-		pool_index = 0;
-		goto next_container;
-	}
-	pool = cont->pools[pool_index];
-	rte_spinlock_unlock(&cont->resize_sl);
+	rte_spinlock_lock(&cmng->resize_sl);
+	pool = cmng->pools[pool_index];
+	rte_spinlock_unlock(&cmng->resize_sl);
 	if (pool->raw_hw)
 		/* There is a pool query in progress. */
 		goto set_alarm;
@@ -6678,14 +6643,6 @@ struct mlx5_meter_domains_infos *
 	if (!pool->raw_hw)
 		/* No free counter statistics raw memory. */
 		goto set_alarm;
-	dcs = (struct mlx5_devx_obj *)(uintptr_t)rte_atomic64_read
-							      (&pool->a64_dcs);
-	if (dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1)) {
-		/* Pool without valid counter. */
-		pool->raw_hw = NULL;
-		goto next_pool;
-	}
-	offset = batch ? 0 : dcs->id % MLX5_COUNTERS_PER_POOL;
 	/*
 	 * Identify the counters released between query trigger and query
 	 * handle more efficiently. The counter released in this gap period
@@ -6693,11 +6650,12 @@ struct mlx5_meter_domains_infos *
 	 * will not be taken into account.
 	 */
 	pool->query_gen++;
-	ret = mlx5_devx_cmd_flow_counter_query(dcs, 0, MLX5_COUNTERS_PER_POOL -
-					       offset, NULL, NULL,
+	ret = mlx5_devx_cmd_flow_counter_query(pool->min_dcs, 0,
+					       MLX5_COUNTERS_PER_POOL,
+					       NULL, NULL,
 					       pool->raw_hw->mem_mng->dm->id,
 					       (void *)(uintptr_t)
-					       (pool->raw_hw->data + offset),
+					       pool->raw_hw->data,
 					       sh->devx_comp,
 					       (uint64_t)(uintptr_t)pool);
 	if (ret) {
@@ -6706,17 +6664,12 @@ struct mlx5_meter_domains_infos *
 		pool->raw_hw = NULL;
 		goto set_alarm;
 	}
-	pool->raw_hw->min_dcs_id = dcs->id;
 	LIST_REMOVE(pool->raw_hw, next);
 	sh->cmng.pending_queries++;
-next_pool:
 	pool_index++;
-	if (pool_index >= rte_atomic16_read(&cont->n_valid)) {
-		batch ^= 0x1;
+	if (pool_index >= rte_atomic16_read(&cmng->n_valid))
 		pool_index = 0;
-	}
 set_alarm:
-	sh->cmng.batch = batch;
 	sh->cmng.pool_index = pool_index;
 	mlx5_set_query_alarm(sh);
 }
@@ -6809,8 +6762,7 @@ struct mlx5_meter_domains_infos *
 		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
 	struct mlx5_counter_stats_raw *raw_to_free;
 	uint8_t query_gen = pool->query_gen ^ 1;
-	struct mlx5_pools_container *cont =
-		MLX5_CNT_CONTAINER(sh, !IS_EXT_POOL(pool));
+	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	enum mlx5_counter_type cnt_type =
 		IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
 				    MLX5_COUNTER_TYPE_ORIGIN;
@@ -6827,10 +6779,10 @@ struct mlx5_meter_domains_infos *
 		/* Be sure the new raw counters data is updated in memory. */
 		rte_io_wmb();
 		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
-			rte_spinlock_lock(&cont->csl);
-			TAILQ_CONCAT(&cont->counters[cnt_type],
+			rte_spinlock_lock(&cmng->csl[cnt_type]);
+			TAILQ_CONCAT(&cmng->counters[cnt_type],
 				     &pool->counters[query_gen], next);
-			rte_spinlock_unlock(&cont->csl);
+			rte_spinlock_unlock(&cmng->csl[cnt_type]);
 		}
 	}
 	LIST_INSERT_HEAD(&sh->cmng.free_stat_raws, raw_to_free, next);
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index ec6aa19..b4be476 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1144,4 +1144,5 @@ int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
 				    const struct rte_flow_attr *attr);
 int mlx5_flow_meter_flush(struct rte_eth_dev *dev,
 			  struct rte_mtr_error *error);
+int mlx5_flow_dv_discover_counter_offset_support(struct rte_eth_dev *dev);
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index b16db1d..bd29140 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4603,19 +4603,13 @@ struct field_modify_info modify_tcp[] = {
 			   struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
-	uint32_t batch = 0;
 
 	/* Decrease to original index and clear shared bit. */
 	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
-	if (IS_BATCH_CNT(idx)) {
-		idx -= MLX5_CNT_BATCH_OFFSET;
-		batch = 1;
-	}
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cont->n);
-	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+	MLX5_ASSERT(idx / MLX5_COUNTERS_PER_POOL < cmng->n);
+	pool = cmng->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
 		*ppool = pool;
@@ -4647,8 +4641,8 @@ struct field_modify_info modify_tcp[] = {
 /**
  * Get a pool by devx counter ID.
  *
- * @param[in] cont
- *   Pointer to the counter container.
+ * @param[in] cmng
+ *   Pointer to the counter management.
  * @param[in] id
  *   The counter devx ID.
  *
@@ -4656,25 +4650,25 @@ struct field_modify_info modify_tcp[] = {
  *   The counter pool pointer if exists, NULL otherwise,
  */
 static struct mlx5_flow_counter_pool *
-flow_dv_find_pool_by_id(struct mlx5_pools_container *cont, int id)
+flow_dv_find_pool_by_id(struct mlx5_flow_counter_mng *cmng, int id)
 {
 	uint32_t i;
 
 	/* Check last used pool. */
-	if (cont->last_pool_idx != POOL_IDX_INVALID &&
-	    flow_dv_is_counter_in_pool(cont->pools[cont->last_pool_idx], id))
-		return cont->pools[cont->last_pool_idx];
+	if (cmng->last_pool_idx != POOL_IDX_INVALID &&
+	    flow_dv_is_counter_in_pool(cmng->pools[cmng->last_pool_idx], id))
+		return cmng->pools[cmng->last_pool_idx];
 	/* ID out of range means no suitable pool in the container. */
-	if (id > cont->max_id || id < cont->min_id)
+	if (id > cmng->max_id || id < cmng->min_id)
 		return NULL;
 	/*
 	 * Find the pool from the end of the container, since mostly counter
 	 * ID is sequence increasing, and the last pool should be the needed
 	 * one.
 	 */
-	i = rte_atomic16_read(&cont->n_valid);
+	i = rte_atomic16_read(&cmng->n_valid);
 	while (i--) {
-		struct mlx5_flow_counter_pool *pool = cont->pools[i];
+		struct mlx5_flow_counter_pool *pool = cmng->pools[i];
 
 		if (flow_dv_is_counter_in_pool(pool, id))
 			return pool;
@@ -4764,20 +4758,18 @@ struct field_modify_info modify_tcp[] = {
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  *
  * @return
  *   0 on success, otherwise negative errno value and rte_errno is set.
  */
 static int
-flow_dv_container_resize(struct rte_eth_dev *dev, uint32_t batch)
+flow_dv_container_resize(struct rte_eth_dev *dev)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
-	void *old_pools = cont->pools;
-	uint32_t resize = cont->n + MLX5_CNT_CONTAINER_RESIZE;
+	void *old_pools = cmng->pools;
+	uint32_t resize = cmng->n + MLX5_CNT_CONTAINER_RESIZE;
 	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
 	void *pools = mlx5_malloc(MLX5_MEM_ZERO, mem_size, 0, SOCKET_ID_ANY);
 
@@ -4786,7 +4778,7 @@ struct field_modify_info modify_tcp[] = {
 		return -ENOMEM;
 	}
 	if (old_pools)
-		memcpy(pools, old_pools, cont->n *
+		memcpy(pools, old_pools, cmng->n *
 				       sizeof(struct mlx5_flow_counter_pool *));
 	/*
 	 * Fallback mode query the counter directly, no background query
@@ -4807,11 +4799,11 @@ struct field_modify_info modify_tcp[] = {
 					 MLX5_CNT_CONTAINER_RESIZE +
 					 i, next);
 	}
-	rte_spinlock_lock(&cont->resize_sl);
-	cont->n = resize;
-	cont->mem_mng = mem_mng;
-	cont->pools = pools;
-	rte_spinlock_unlock(&cont->resize_sl);
+	rte_spinlock_lock(&cmng->resize_sl);
+	cmng->n = resize;
+	cmng->mem_mng = mem_mng;
+	cmng->pools = pools;
+	rte_spinlock_unlock(&cmng->resize_sl);
 	if (old_pools)
 		mlx5_free(old_pools);
 	return 0;
@@ -4844,27 +4836,15 @@ struct field_modify_info modify_tcp[] = {
 
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (!IS_BATCH_CNT(counter)) {
+	if (priv->counter_fallback) {
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-		if (priv->counter_fallback)
-			return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
+		return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
 					0, pkts, bytes, 0, NULL, NULL, 0);
 	}
-
 	rte_spinlock_lock(&pool->sl);
-	/*
-	 * The single counters allocation may allocate smaller ID than the
-	 * current allocated in parallel to the host reading.
-	 * In this case the new counter values must be reported as 0.
-	 */
-	if (unlikely(cnt_ext && cnt_ext->dcs->id < pool->raw->min_dcs_id)) {
-		*pkts = 0;
-		*bytes = 0;
-	} else {
-		offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
-		*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
-		*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
-	}
+	offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
+	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
 	rte_spinlock_unlock(&pool->sl);
 	return 0;
 }
@@ -4876,8 +4856,6 @@ struct field_modify_info modify_tcp[] = {
  *   Pointer to the Ethernet device structure.
  * @param[out] dcs
  *   The devX counter handle.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  * @param[in] age
  *   Whether the pool is for counter that was allocated for aging.
  * @param[in/out] cont_cur
@@ -4888,124 +4866,64 @@ struct field_modify_info modify_tcp[] = {
  */
 static struct mlx5_flow_counter_pool *
 flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs,
-		    uint32_t batch, uint32_t age)
+		    uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	int16_t n_valid = rte_atomic16_read(&cont->n_valid);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
+	int16_t n_valid = rte_atomic16_read(&cmng->n_valid);
+	uint32_t fallback = priv->counter_fallback;
 	uint32_t size = sizeof(*pool);
 
-	if (cont->n == n_valid && flow_dv_container_resize(dev, batch))
+	if (cmng->n == n_valid && flow_dv_container_resize(dev))
 		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
-	size += (batch ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
+	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
 	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
 	pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 	if (!pool) {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
-	pool->min_dcs = dcs;
-	if (!priv->counter_fallback)
-		pool->raw = cont->mem_mng->raws + n_valid %
+	if (!fallback) {
+		pool->min_dcs = dcs;
+		pool->raw = cmng->mem_mng->raws + n_valid %
 						      MLX5_CNT_CONTAINER_RESIZE;
+	}
 	pool->raw_hw = NULL;
 	pool->type = 0;
-	pool->type |= (batch ? 0 :  CNT_POOL_TYPE_EXT);
+	pool->type |= (!fallback ? 0 :  CNT_POOL_TYPE_EXT);
 	pool->type |= (!age ? 0 :  CNT_POOL_TYPE_AGE);
 	pool->query_gen = 0;
 	rte_spinlock_init(&pool->sl);
 	TAILQ_INIT(&pool->counters[0]);
 	TAILQ_INIT(&pool->counters[1]);
-	TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+	TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
 	pool->index = n_valid;
 	pool->time_of_last_age_check = MLX5_CURR_TIME_SEC;
-	cont->pools[n_valid] = pool;
-	if (!batch) {
+	cmng->pools[n_valid] = pool;
+	if (fallback) {
 		int base = RTE_ALIGN_FLOOR(dcs->id, MLX5_COUNTERS_PER_POOL);
 
-		if (base < cont->min_id)
-			cont->min_id = base;
-		if (base > cont->max_id)
-			cont->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
-		cont->last_pool_idx = pool->index;
+		if (base < cmng->min_id)
+			cmng->min_id = base;
+		if (base > cmng->max_id)
+			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
+		cmng->last_pool_idx = pool->index;
 	}
 	/* Pool initialization must be updated before host thread access. */
 	rte_io_wmb();
-	rte_atomic16_add(&cont->n_valid, 1);
+	rte_atomic16_add(&cmng->n_valid, 1);
 	return pool;
 }
 
 /**
- * Restore skipped counters in the pool.
- *
- * As counter pool query requires the first counter dcs
- * ID start with 4 alinged, if the pool counters with
- * min_dcs ID are not aligned with 4, the counters will
- * be skipped.
- * Once other min_dcs ID less than these skipped counter
- * dcs ID appears, the skipped counters will be safe to
- * use.
- * Should be called when min_dcs is updated.
- *
- * @param[in] pool
- *   Current counter pool.
- * @param[in] last_min_dcs
- *   Last min_dcs.
- */
-static void
-flow_dv_counter_restore(struct mlx5_flow_counter_pool *pool,
-			struct mlx5_devx_obj *last_min_dcs)
-{
-	struct mlx5_flow_counter_ext *cnt_ext;
-	uint32_t offset, new_offset;
-	uint32_t skip_cnt = 0;
-	uint32_t i;
-
-	if (!pool->skip_cnt)
-		return;
-	/*
-	 * If last min_dcs is not valid. The skipped counter may even after
-	 * last min_dcs, set the offset to the whole pool.
-	 */
-	if (last_min_dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))
-		offset = MLX5_COUNTERS_PER_POOL;
-	else
-		offset = last_min_dcs->id % MLX5_COUNTERS_PER_POOL;
-	new_offset = pool->min_dcs->id % MLX5_COUNTERS_PER_POOL;
-	/*
-	 * Check the counters from 1 to the last_min_dcs range. Counters
-	 * before new min_dcs indicates pool still has skipped counters.
-	 * Counters be skipped after new min_dcs will be ready to use.
-	 * Offset 0 counter must be empty or min_dcs, start from 1.
-	 */
-	for (i = 1; i < offset; i++) {
-		cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-		if (cnt_ext->skipped) {
-			if (i > new_offset) {
-				cnt_ext->skipped = 0;
-				TAILQ_INSERT_TAIL
-					(&pool->counters[pool->query_gen],
-					 MLX5_POOL_GET_CNT(pool, i), next);
-			} else {
-				skip_cnt++;
-			}
-		}
-	}
-	if (!skip_cnt)
-		pool->skip_cnt = 0;
-}
-
-/**
  * Prepare a new counter and/or a new counter pool.
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
  * @param[out] cnt_free
  *   Where to put the pointer of a new counter.
- * @param[in] batch
- *   Whether the pool is for counter that was allocated by batch command.
  * @param[in] age
  *   Whether the pool is for counter that was allocated for aging.
  *
@@ -5016,98 +4934,45 @@ struct field_modify_info modify_tcp[] = {
 static struct mlx5_flow_counter_pool *
 flow_dv_counter_pool_prepare(struct rte_eth_dev *dev,
 			     struct mlx5_flow_counter **cnt_free,
-			     uint32_t batch, uint32_t age)
+			     uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_counters tmp_tq;
-	struct mlx5_devx_obj *last_min_dcs;
 	struct mlx5_devx_obj *dcs = NULL;
 	struct mlx5_flow_counter *cnt;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
-	uint32_t add2other;
+	uint32_t fallback = priv->counter_fallback;
 	uint32_t i;
 
-	cont = MLX5_CNT_CONTAINER(priv->sh, batch);
-	if (!batch) {
-retry:
-		add2other = 0;
+	if (fallback) {
 		/* bulk_bitmap must be 0 for single counter allocation. */
 		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0);
 		if (!dcs)
 			return NULL;
-		pool = flow_dv_find_pool_by_id(cont, dcs->id);
-		/*
-		 * If the pool exists but with another type, the counter
-		 * will be added to the other pool and a new counter in
-		 * the range with the same type must be reallocated later.
-		 */
+		pool = flow_dv_find_pool_by_id(cmng, dcs->id);
 		if (!pool) {
-			pool = flow_dv_pool_create(dev, dcs, batch,
-						   age);
+			pool = flow_dv_pool_create(dev, dcs, age);
 			if (!pool) {
 				mlx5_devx_cmd_destroy(dcs);
 				return NULL;
 			}
-		} else if ((!!IS_AGE_POOL(pool)) != age) {
-			add2other = 1;
-		}
-		if ((dcs->id < pool->min_dcs->id ||
-		    pool->min_dcs->id &
-		    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1)) &&
-		    !(dcs->id & (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))) {
-			/*
-			 * Update the pool min_dcs only if current dcs is
-			 * valid and exist min_dcs is not valid or greater
-			 * than new dcs.
-			 */
-			last_min_dcs = pool->min_dcs;
-			rte_atomic64_set(&pool->a64_dcs,
-					 (int64_t)(uintptr_t)dcs);
-			/*
-			 * Restore any skipped counters if the new min_dcs
-			 * ID is smaller or min_dcs is not valid.
-			 */
-			if (dcs->id < last_min_dcs->id ||
-			    last_min_dcs->id &
-			    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1))
-				flow_dv_counter_restore(pool, last_min_dcs);
 		}
 		i = dcs->id % MLX5_COUNTERS_PER_POOL;
 		cnt = MLX5_POOL_GET_CNT(pool, i);
 		cnt->pool = pool;
 		MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs;
-		/*
-		 * If min_dcs is not valid, it means the new allocated dcs
-		 * also fail to become the valid min_dcs, just skip it.
-		 * Or if min_dcs is valid, and new dcs ID is smaller than
-		 * min_dcs, but not become the min_dcs, also skip it.
-		 */
-		if (pool->min_dcs->id &
-		    (MLX5_CNT_BATCH_QUERY_ID_ALIGNMENT - 1) ||
-		    dcs->id < pool->min_dcs->id) {
-			MLX5_GET_POOL_CNT_EXT(pool, i)->skipped = 1;
-			pool->skip_cnt = 1;
-			goto retry;
-		}
-		if (add2other) {
-			TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen],
-					  cnt, next);
-			goto retry;
-		}
 		*cnt_free = cnt;
 		return pool;
 	}
-	/* bulk_bitmap is in 128 counters units. */
-	if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4)
-		dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4);
+	dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4);
 	if (!dcs) {
 		rte_errno = ENODATA;
 		return NULL;
 	}
-	pool = flow_dv_pool_create(dev, dcs, batch, age);
+	pool = flow_dv_pool_create(dev, dcs, age);
 	if (!pool) {
 		mlx5_devx_cmd_destroy(dcs);
 		return NULL;
@@ -5118,9 +4983,9 @@ struct field_modify_info modify_tcp[] = {
 		cnt->pool = pool;
 		TAILQ_INSERT_HEAD(&tmp_tq, cnt, next);
 	}
-	rte_spinlock_lock(&cont->csl);
-	TAILQ_CONCAT(&cont->counters[cnt_type], &tmp_tq, next);
-	rte_spinlock_unlock(&cont->csl);
+	rte_spinlock_lock(&cmng->csl[cnt_type]);
+	TAILQ_CONCAT(&cmng->counters[cnt_type], &tmp_tq, next);
+	rte_spinlock_unlock(&cmng->csl[cnt_type]);
 	*cnt_free = MLX5_POOL_GET_CNT(pool, 0);
 	(*cnt_free)->pool = pool;
 	return pool;
@@ -5157,8 +5022,6 @@ struct field_modify_info modify_tcp[] = {
  *   Indicate if this counter is shared with other flows.
  * @param[in] id
  *   Counter identifier.
- * @param[in] group
- *   Counter flow group.
  * @param[in] age
  *   Whether the counter was allocated for aging.
  *
@@ -5167,22 +5030,14 @@ struct field_modify_info modify_tcp[] = {
  */
 static uint32_t
 flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id,
-		      uint16_t group, uint32_t age)
+		      uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt_free = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
-	/*
-	 * Currently group 0 flow counter cannot be assigned to a flow if it is
-	 * not the first one in the batch counter allocation, so it is better
-	 * to allocate counters one by one for these flows in a separate
-	 * container.
-	 * A counter can be shared between different groups so need to take
-	 * shared counters from the single container.
-	 */
-	uint32_t batch = (group && !shared && !priv->counter_fallback) ? 1 : 0;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, batch);
+	uint32_t fallback = priv->counter_fallback;
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
 	uint32_t cnt_idx;
@@ -5205,16 +5060,15 @@ struct field_modify_info modify_tcp[] = {
 		}
 	}
 	/* Get free counters from container. */
-	rte_spinlock_lock(&cont->csl);
-	cnt_free = TAILQ_FIRST(&cont->counters[cnt_type]);
+	rte_spinlock_lock(&cmng->csl[cnt_type]);
+	cnt_free = TAILQ_FIRST(&cmng->counters[cnt_type]);
 	if (cnt_free)
-		TAILQ_REMOVE(&cont->counters[cnt_type], cnt_free, next);
-	rte_spinlock_unlock(&cont->csl);
-	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free,
-						       batch, age))
+		TAILQ_REMOVE(&cmng->counters[cnt_type], cnt_free, next);
+	rte_spinlock_unlock(&cmng->csl[cnt_type]);
+	if (!cnt_free && !flow_dv_counter_pool_prepare(dev, &cnt_free, age))
 		goto err;
 	pool = cnt_free->pool;
-	if (!batch)
+	if (fallback)
 		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free);
 	/* Create a DV counter action only in the first time usage. */
 	if (!cnt_free->action) {
@@ -5222,7 +5076,7 @@ struct field_modify_info modify_tcp[] = {
 		struct mlx5_devx_obj *dcs;
 		int ret;
 
-		if (batch) {
+		if (!fallback) {
 			offset = MLX5_CNT_ARRAY_IDX(pool, cnt_free);
 			dcs = pool->min_dcs;
 		} else {
@@ -5238,7 +5092,6 @@ struct field_modify_info modify_tcp[] = {
 	}
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool->index,
 				MLX5_CNT_ARRAY_IDX(pool, cnt_free));
-	cnt_idx += batch * MLX5_CNT_BATCH_OFFSET;
 	/* Update the counter reset values. */
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
@@ -5253,16 +5106,16 @@ struct field_modify_info modify_tcp[] = {
 		cnt_free->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
-	if (!priv->counter_fallback && !priv->sh->cmng.query_thread_on)
+	if (!fallback && !priv->sh->cmng.query_thread_on)
 		/* Start the asynchronous batch query by the host thread. */
 		mlx5_set_query_alarm(priv->sh);
 	return cnt_idx;
 err:
 	if (cnt_free) {
 		cnt_free->pool = pool;
-		rte_spinlock_lock(&cont->csl);
-		TAILQ_INSERT_TAIL(&cont->counters[cnt_type], cnt_free, next);
-		rte_spinlock_unlock(&cont->csl);
+		rte_spinlock_lock(&cmng->csl[cnt_type]);
+		TAILQ_INSERT_TAIL(&cmng->counters[cnt_type], cnt_free, next);
+		rte_spinlock_unlock(&cmng->csl[cnt_type]);
 	}
 	return 0;
 }
@@ -5346,7 +5199,6 @@ struct field_modify_info modify_tcp[] = {
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-
 	if (IS_SHARED_CNT(counter)) {
 		if (--cnt->shared_info.ref_cnt)
 			return;
@@ -5371,8 +5223,7 @@ struct field_modify_info modify_tcp[] = {
 	} else {
 		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
 					       MLX5_COUNTER_TYPE_ORIGIN;
-		TAILQ_INSERT_TAIL(&((MLX5_CNT_CONTAINER
-				  (priv->sh, 0))->counters[cnt_type]),
+		TAILQ_INSERT_TAIL(&priv->sh->cmng.counters[cnt_type],
 				  cnt, next);
 	}
 }
@@ -8507,8 +8358,7 @@ struct field_modify_info modify_tcp[] = {
 
 	counter = flow_dv_counter_alloc(dev,
 				count ? count->shared : 0,
-				count ? count->id : 0,
-				dev_flow->dv.group, !!age);
+				count ? count->id : 0, !!age);
 	if (!counter || age == NULL)
 		return counter;
 	age_param  = flow_dv_counter_idx_get_age(dev, counter);
@@ -11419,6 +11269,103 @@ struct field_modify_info modify_tcp[] = {
 }
 
 /**
+ * Validate the batch counter support in root table.
+ *
+ * Create a simple flow with an invalid counter and a destination table action
+ * on the root table to check whether a batch counter with offset is supported.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_dv_discover_counter_offset_support(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_dv_match_params mask = {
+		.size = sizeof(mask.buf),
+	};
+	struct mlx5_flow_dv_match_params value = {
+		.size = sizeof(value.buf),
+	};
+	struct mlx5dv_flow_matcher_attr dv_attr = {
+		.type = IBV_FLOW_ATTR_NORMAL,
+		.priority = 0,
+		.match_criteria_enable = 0,
+		.match_mask = (void *)&mask,
+	};
+	void *actions[2] = { 0 };
+	struct mlx5_flow_tbl_resource *tbl = NULL, *dest_tbl = NULL;
+	struct mlx5_devx_obj *dcs = NULL;
+	void *matcher = NULL;
+	void *flow = NULL;
+	int i, ret = -1;
+
+	tbl = flow_dv_tbl_resource_get(dev, 0, 0, 0, NULL);
+	if (!tbl)
+		goto err;
+	dest_tbl = flow_dv_tbl_resource_get(dev, 1, 0, 0, NULL);
+	if (!dest_tbl)
+		goto err;
+	dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0x4);
+	if (!dcs)
+		goto err;
+	ret = mlx5_flow_os_create_flow_action_count(dcs->obj, UINT16_MAX,
+						    &actions[0]);
+	if (ret)
+		goto err;
+	ret = mlx5_flow_os_create_flow_action_dest_flow_tbl
+				(dest_tbl->obj, &actions[1]);
+	if (ret)
+		goto err;
+	dv_attr.match_criteria_enable = flow_dv_matcher_enable(mask.buf);
+	ret = mlx5_flow_os_create_flow_matcher(sh->ctx, &dv_attr, tbl->obj,
+					       &matcher);
+	if (ret)
+		goto err;
+	ret = mlx5_flow_os_create_flow(matcher, (void *)&value, 2,
+				       actions, &flow);
+err:
+	/*
+	 * If batch counters with offset are not supported, the driver does
+	 * not validate the invalid offset value, so flow creation should
+	 * succeed; batch counters are then not supported in the root table.
+	 *
+	 * Otherwise, if flow creation fails, counter offset is supported.
+	 */
+	if (flow) {
+		DRV_LOG(INFO, "Batch counter is not supported in root "
+			      "table. Switch to fallback mode.");
+		rte_errno = ENOTSUP;
+		ret = -rte_errno;
+		claim_zero(mlx5_flow_os_destroy_flow(flow));
+	} else {
+		/* Check the matcher to ensure the failure came from flow create. */
+		if (!matcher || (matcher && errno != EINVAL))
+			DRV_LOG(ERR, "Unexpected error in counter offset "
+				     "support detection");
+		ret = 0;
+	}
+	for (i = 0; i < 2; i++) {
+		if (actions[i])
+			claim_zero(mlx5_flow_os_destroy_flow_action
+				   (actions[i]));
+	}
+	if (matcher)
+		claim_zero(mlx5_flow_os_destroy_flow_matcher(matcher));
+	if (tbl)
+		flow_dv_tbl_resource_release(dev, tbl);
+	if (dest_tbl)
+		flow_dv_tbl_resource_release(dev, dest_tbl);
+	if (dcs)
+		claim_zero(mlx5_devx_cmd_destroy(dcs));
+	return ret;
+}
+
+/**
  * Query a devx counter.
  *
  * @param[in] dev
@@ -11580,7 +11527,7 @@ struct field_modify_info modify_tcp[] = {
 	uint32_t cnt;
 
 	flow_dv_shared_lock(dev);
-	cnt = flow_dv_counter_alloc(dev, 0, 0, 1, 0);
+	cnt = flow_dv_counter_alloc(dev, 0, 0, 0);
 	flow_dv_shared_unlock(dev);
 	return cnt;
 }
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index bda55c2..bd2a734 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -159,11 +159,11 @@
 			      struct mlx5_flow_counter_pool **ppool)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
 
 	idx = (idx - 1) & (MLX5_CNT_SHARED_OFFSET - 1);
-	pool = cont->pools[idx / MLX5_COUNTERS_PER_POOL];
+	pool = cmng->pools[idx / MLX5_COUNTERS_PER_POOL];
 	MLX5_ASSERT(pool);
 	if (ppool)
 		*ppool = pool;
@@ -254,12 +254,12 @@
 flow_verbs_counter_new(struct rte_eth_dev *dev, uint32_t shared, uint32_t id)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_pools_container *cont = MLX5_CNT_CONTAINER(priv->sh, 0);
+	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
 	union mlx5_l3t_data data;
-	uint32_t n_valid = rte_atomic16_read(&cont->n_valid);
+	uint32_t n_valid = rte_atomic16_read(&cmng->n_valid);
 	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
@@ -275,7 +275,7 @@
 		return data.dword;
 	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
-		pool = cont->pools[pool_idx];
+		pool = cmng->pools[pool_idx];
 		if (!pool)
 			continue;
 		cnt = TAILQ_FIRST(&pool->counters[0]);
@@ -286,7 +286,7 @@
 		struct mlx5_flow_counter_pool **pools;
 		uint32_t size;
 
-		if (n_valid == cont->n) {
+		if (n_valid == cmng->n) {
 			/* Resize the container pool array. */
 			size = sizeof(struct mlx5_flow_counter_pool *) *
 				     (n_valid + MLX5_CNT_CONTAINER_RESIZE);
@@ -295,13 +295,13 @@
 			if (!pools)
 				return 0;
 			if (n_valid) {
-				memcpy(pools, cont->pools,
+				memcpy(pools, cmng->pools,
 				       sizeof(struct mlx5_flow_counter_pool *) *
 				       n_valid);
-				mlx5_free(cont->pools);
+				mlx5_free(cmng->pools);
 			}
-			cont->pools = pools;
-			cont->n += MLX5_CNT_CONTAINER_RESIZE;
+			cmng->pools = pools;
+			cmng->n += MLX5_CNT_CONTAINER_RESIZE;
 		}
 		/* Allocate memory for new pool*/
 		size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) *
@@ -315,10 +315,10 @@
 			TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 		}
 		cnt = MLX5_POOL_GET_CNT(pool, 0);
-		cont->pools[n_valid] = pool;
+		cmng->pools[n_valid] = pool;
 		pool_idx = n_valid;
-		rte_atomic16_add(&cont->n_valid, 1);
-		TAILQ_INSERT_HEAD(&cont->pool_list, pool, next);
+		rte_atomic16_add(&cmng->n_valid, 1);
+		TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 4/8] net/mlx5: synchronize flow counter pool creation
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (2 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: remove single counter container Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: make three level table thread safe Suanming Mou
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

Currently, counter operations are not thread safe as the counter
pools' array resize is not protected.

This commit protects the container pools' array resize with a spinlock.
The original counter pool statistic memory allocation is moved to the
host thread in order to minimize the critical section, since the pool
statistic memory is required only at query time. The pools' array
itself must still be resized by the user threads: a new pool may be
used by other rte_flow APIs before the host thread finishes its work,
and if the pool were not saved to the pools' array, the counters in
that pool could not be found by the counter management code. The pool
raw statistic memory is filled in by the host thread.

The shared counters will be protected in another commit.
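
As a reading aid, the publication sequence under the new lock reduces
to the following sketch (a minimal illustration, not the verbatim
driver code; resize_pools() stands in for flow_dv_container_resize()):

	rte_spinlock_lock(&cmng->pool_update_sl);
	pool->index = cmng->n_valid;
	if (pool->index == cmng->n && resize_pools(cmng)) {
		/* Resize failed, drop the half-initialized pool. */
		rte_spinlock_unlock(&cmng->pool_update_sl);
		mlx5_free(pool);
		return NULL;
	}
	/* Publish the pool before any reader can look it up. */
	cmng->pools[pool->index] = pool;
	cmng->n_valid++;
	rte_spinlock_unlock(&cmng->pool_update_sl);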

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c   |   5 ++
 drivers/net/mlx5/mlx5.c            |  12 ++-
 drivers/net/mlx5/mlx5.h            |  10 ++-
 drivers/net/mlx5/mlx5_flow.c       | 120 +++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_flow_dv.c    | 171 ++++++++++---------------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |   5 +-
 6 files changed, 177 insertions(+), 146 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index fbd95e7..6e33b2b 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1067,6 +1067,11 @@
 			err = -err;
 			goto error;
 		}
+		/* Check relaxed ordering support. */
+		if (config->hca_attr.relaxed_ordering_write &&
+		    config->hca_attr.relaxed_ordering_read  &&
+		    !haswell_broadwell_cpu)
+			sh->cmng.relaxed_ordering = 1;
 		/* Check for LRO support. */
 		if (config->dest_tir && config->hca_attr.lro_cap &&
 		    config->dv_flow_en) {
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 4d1ca9a..e805723 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -487,8 +487,7 @@ struct mlx5_flow_id_pool *
 	sh->cmng.min_id = MLX5_CNT_BATCH_OFFSET;
 	sh->cmng.max_id = -1;
 	sh->cmng.last_pool_idx = POOL_IDX_INVALID;
-	TAILQ_INIT(&sh->cmng.pool_list);
-	rte_spinlock_init(&sh->cmng.resize_sl);
+	rte_spinlock_init(&sh->cmng.pool_update_sl);
 	for (i = 0; i < MLX5_COUNTER_TYPE_MAX; i++) {
 		TAILQ_INIT(&sh->cmng.counters[i]);
 		rte_spinlock_init(&sh->cmng.csl[i]);
@@ -522,7 +521,7 @@ struct mlx5_flow_id_pool *
 mlx5_flow_counters_mng_close(struct mlx5_dev_ctx_shared *sh)
 {
 	struct mlx5_counter_stats_mem_mng *mng;
-	int j;
+	int i, j;
 	int retries = 1024;
 
 	rte_errno = 0;
@@ -535,9 +534,10 @@ struct mlx5_flow_id_pool *
 
 	if (sh->cmng.pools) {
 		struct mlx5_flow_counter_pool *pool;
+		uint16_t n_valid = sh->cmng.n_valid;
 
-		pool = TAILQ_FIRST(&sh->cmng.pool_list);
-		while (pool) {
+		for (i = 0; i < n_valid; ++i) {
+			pool = sh->cmng.pools[i];
 			if (!IS_EXT_POOL(pool) && pool->min_dcs)
 				claim_zero(mlx5_devx_cmd_destroy
 							       (pool->min_dcs));
@@ -553,9 +553,7 @@ struct mlx5_flow_id_pool *
 						   (MLX5_GET_POOL_CNT_EXT
 						    (pool, j)->dcs));
 			}
-			TAILQ_REMOVE(&sh->cmng.pool_list, pool, next);
 			mlx5_free(pool);
-			pool = TAILQ_FIRST(&sh->cmng.pool_list);
 		}
 		mlx5_free(sh->cmng.pools);
 	}
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 78cdac3..e314668 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -406,8 +406,10 @@ struct mlx5_flow_counter_pool {
 	uint32_t type:2; /* Memory type behind the counter array. */
 	volatile uint32_t query_gen:1; /* Query round. */
 	rte_spinlock_t sl; /* The pool lock. */
+	rte_spinlock_t csl; /* The pool counter free list lock. */
 	struct mlx5_counter_stats_raw *raw;
-	struct mlx5_counter_stats_raw *raw_hw; /* The raw on HW working. */
+	struct mlx5_counter_stats_raw *raw_hw;
+	/* The raw memory in use by an in-flight HW query. */
 };
 
 /* Memory management structure for group of counter statistics raws. */
@@ -429,17 +431,16 @@ struct mlx5_counter_stats_raw {
 
 /* Counter global management structure. */
 struct mlx5_flow_counter_mng {
-	rte_atomic16_t n_valid; /* Number of valid pools. */
+	volatile uint16_t n_valid; /* Number of valid pools. */
 	uint16_t n; /* Number of pools. */
 	uint16_t last_pool_idx; /* Last used pool index */
 	int min_id; /* The minimum counter ID in the pools. */
 	int max_id; /* The maximum counter ID in the pools. */
-	rte_spinlock_t resize_sl; /* The resize lock. */
+	rte_spinlock_t pool_update_sl; /* The pool update lock. */
 	rte_spinlock_t csl[MLX5_COUNTER_TYPE_MAX];
 	/* The counter free list lock. */
 	struct mlx5_counters counters[MLX5_COUNTER_TYPE_MAX];
 	/* Free counter list. */
-	struct mlx5_counter_pools pool_list; /* Counter pool list. */
 	struct mlx5_flow_counter_pool **pools; /* Counter pool array. */
 	struct mlx5_counter_stats_mem_mng *mem_mng;
 	/* Hold the memory management for the next allocated pools raws. */
@@ -447,6 +448,7 @@ struct mlx5_flow_counter_mng {
 	uint8_t pending_queries;
 	uint16_t pool_index;
 	uint8_t query_thread_on;
+	bool relaxed_ordering;
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
 	LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
 };
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index dae7ac3..c79d02e 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -13,6 +13,7 @@
 #include <rte_common.h>
 #include <rte_ether.h>
 #include <rte_ethdev_driver.h>
+#include <rte_eal_paging.h>
 #include <rte_flow.h>
 #include <rte_cycles.h>
 #include <rte_flow_driver.h>
@@ -29,6 +30,7 @@
 #include "mlx5_flow.h"
 #include "mlx5_flow_os.h"
 #include "mlx5_rxtx.h"
+#include "mlx5_common_os.h"
 
 /** Device flow drivers. */
 extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
@@ -6589,6 +6591,111 @@ struct mlx5_meter_domains_infos *
 	return -ENOTSUP;
 }
 
+/**
+ * Allocate new memory for the counter values, wrapped by all the needed
+ * management structures.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_flow_create_counter_stat_mem_mng(struct mlx5_dev_ctx_shared *sh)
+{
+	struct mlx5_devx_mkey_attr mkey_attr;
+	struct mlx5_counter_stats_mem_mng *mem_mng;
+	volatile struct flow_counter_stats *raw_data;
+	int raws_n = MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES;
+	int size = (sizeof(struct flow_counter_stats) *
+			MLX5_COUNTERS_PER_POOL +
+			sizeof(struct mlx5_counter_stats_raw)) * raws_n +
+			sizeof(struct mlx5_counter_stats_mem_mng);
+	size_t pgsize = rte_mem_page_size();
+	uint8_t *mem;
+	int i;
+
+	if (pgsize == (size_t)-1) {
+		DRV_LOG(ERR, "Failed to get mem page size");
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+	mem = mlx5_malloc(MLX5_MEM_ZERO, size, pgsize, SOCKET_ID_ANY);
+	if (!mem) {
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+	mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
+	size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
+	mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
+						 IBV_ACCESS_LOCAL_WRITE);
+	if (!mem_mng->umem) {
+		rte_errno = errno;
+		mlx5_free(mem);
+		return -rte_errno;
+	}
+	mkey_attr.addr = (uintptr_t)mem;
+	mkey_attr.size = size;
+	mkey_attr.umem_id = mlx5_os_get_umem_id(mem_mng->umem);
+	mkey_attr.pd = sh->pdn;
+	mkey_attr.log_entity_size = 0;
+	mkey_attr.pg_access = 0;
+	mkey_attr.klm_array = NULL;
+	mkey_attr.klm_num = 0;
+	mkey_attr.relaxed_ordering = sh->cmng.relaxed_ordering;
+	mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
+	if (!mem_mng->dm) {
+		mlx5_glue->devx_umem_dereg(mem_mng->umem);
+		rte_errno = errno;
+		mlx5_free(mem);
+		return -rte_errno;
+	}
+	mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
+	raw_data = (volatile struct flow_counter_stats *)mem;
+	for (i = 0; i < raws_n; ++i) {
+		mem_mng->raws[i].mem_mng = mem_mng;
+		mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
+	}
+	for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
+		LIST_INSERT_HEAD(&sh->cmng.free_stat_raws,
+				 mem_mng->raws + MLX5_CNT_CONTAINER_RESIZE + i,
+				 next);
+	LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
+	sh->cmng.mem_mng = mem_mng;
+	return 0;
+}
+
+/**
+ * Set the statistic memory to the new counter pool.
+ *
+ * @param[in] sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ * @param[in] pool
+ *   Pointer to the pool to set the statistic memory.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_flow_set_counter_stat_mem(struct mlx5_dev_ctx_shared *sh,
+			       struct mlx5_flow_counter_pool *pool)
+{
+	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
+	/* Resize statistic memory once used out. */
+	if (!(pool->index % MLX5_CNT_CONTAINER_RESIZE) &&
+	    mlx5_flow_create_counter_stat_mem_mng(sh)) {
+		DRV_LOG(ERR, "Cannot resize counter stat mem.");
+		return -1;
+	}
+	rte_spinlock_lock(&pool->sl);
+	pool->raw = cmng->mem_mng->raws + pool->index %
+		    MLX5_CNT_CONTAINER_RESIZE;
+	rte_spinlock_unlock(&pool->sl);
+	pool->raw_hw = NULL;
+	return 0;
+}
+
 #define MLX5_POOL_QUERY_FREQ_US 1000000
 
 /**
@@ -6603,7 +6710,7 @@ struct mlx5_meter_domains_infos *
 {
 	uint32_t pools_n, us;
 
-	pools_n = rte_atomic16_read(&sh->cmng.n_valid);
+	pools_n = __atomic_load_n(&sh->cmng.n_valid, __ATOMIC_RELAXED);
 	us = MLX5_POOL_QUERY_FREQ_US / pools_n;
 	DRV_LOG(DEBUG, "Set alarm for %u pools each %u us", pools_n, us);
 	if (rte_eal_alarm_set(us, mlx5_flow_query_alarm, sh)) {
@@ -6629,12 +6736,17 @@ struct mlx5_meter_domains_infos *
 	uint16_t pool_index = sh->cmng.pool_index;
 	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	struct mlx5_flow_counter_pool *pool;
+	uint16_t n_valid;
 
 	if (sh->cmng.pending_queries >= MLX5_MAX_PENDING_QUERIES)
 		goto set_alarm;
-	rte_spinlock_lock(&cmng->resize_sl);
+	rte_spinlock_lock(&cmng->pool_update_sl);
 	pool = cmng->pools[pool_index];
-	rte_spinlock_unlock(&cmng->resize_sl);
+	n_valid = cmng->n_valid;
+	rte_spinlock_unlock(&cmng->pool_update_sl);
+	/* Set the statistic memory to the newly created pool. */
+	if (!pool->raw && mlx5_flow_set_counter_stat_mem(sh, pool))
+		goto set_alarm;
 	if (pool->raw_hw)
 		/* There is a pool query in progress. */
 		goto set_alarm;
@@ -6667,7 +6779,7 @@ struct mlx5_meter_domains_infos *
 	LIST_REMOVE(pool->raw_hw, next);
 	sh->cmng.pending_queries++;
 	pool_index++;
-	if (pool_index >= rte_atomic16_read(&cmng->n_valid))
+	if (pool_index >= n_valid)
 		pool_index = 0;
 set_alarm:
 	sh->cmng.pool_index = pool_index;
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index bd29140..6226d87 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4653,104 +4653,35 @@ struct field_modify_info modify_tcp[] = {
 flow_dv_find_pool_by_id(struct mlx5_flow_counter_mng *cmng, int id)
 {
 	uint32_t i;
+	struct mlx5_flow_counter_pool *pool = NULL;
 
+	rte_spinlock_lock(&cmng->pool_update_sl);
 	/* Check last used pool. */
 	if (cmng->last_pool_idx != POOL_IDX_INVALID &&
-	    flow_dv_is_counter_in_pool(cmng->pools[cmng->last_pool_idx], id))
-		return cmng->pools[cmng->last_pool_idx];
+	    flow_dv_is_counter_in_pool(cmng->pools[cmng->last_pool_idx], id)) {
+		pool = cmng->pools[cmng->last_pool_idx];
+		goto out;
+	}
 	/* ID out of range means no suitable pool in the container. */
 	if (id > cmng->max_id || id < cmng->min_id)
-		return NULL;
+		goto out;
 	/*
 	 * Find the pool from the end of the container, since mostly counter
 	 * ID is sequence increasing, and the last pool should be the needed
 	 * one.
 	 */
-	i = rte_atomic16_read(&cmng->n_valid);
+	i = cmng->n_valid;
 	while (i--) {
-		struct mlx5_flow_counter_pool *pool = cmng->pools[i];
-
-		if (flow_dv_is_counter_in_pool(pool, id))
-			return pool;
-	}
-	return NULL;
-}
+		struct mlx5_flow_counter_pool *pool_tmp = cmng->pools[i];
 
-/**
- * Allocate a new memory for the counter values wrapped by all the needed
- * management.
- *
- * @param[in] dev
- *   Pointer to the Ethernet device structure.
- * @param[in] raws_n
- *   The raw memory areas - each one for MLX5_COUNTERS_PER_POOL counters.
- *
- * @return
- *   The new memory management pointer on success, otherwise NULL and rte_errno
- *   is set.
- */
-static struct mlx5_counter_stats_mem_mng *
-flow_dv_create_counter_stat_mem_mng(struct rte_eth_dev *dev, int raws_n)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_dev_ctx_shared *sh = priv->sh;
-	struct mlx5_devx_mkey_attr mkey_attr;
-	struct mlx5_counter_stats_mem_mng *mem_mng;
-	volatile struct flow_counter_stats *raw_data;
-	int size = (sizeof(struct flow_counter_stats) *
-			MLX5_COUNTERS_PER_POOL +
-			sizeof(struct mlx5_counter_stats_raw)) * raws_n +
-			sizeof(struct mlx5_counter_stats_mem_mng);
-	size_t pgsize = rte_mem_page_size();
-	if (pgsize == (size_t)-1) {
-		DRV_LOG(ERR, "Failed to get mem page size");
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	uint8_t *mem = mlx5_malloc(MLX5_MEM_ZERO, size, pgsize,
-				  SOCKET_ID_ANY);
-	int i;
-
-	if (!mem) {
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	mem_mng = (struct mlx5_counter_stats_mem_mng *)(mem + size) - 1;
-	size = sizeof(*raw_data) * MLX5_COUNTERS_PER_POOL * raws_n;
-	mem_mng->umem = mlx5_glue->devx_umem_reg(sh->ctx, mem, size,
-						 IBV_ACCESS_LOCAL_WRITE);
-	if (!mem_mng->umem) {
-		rte_errno = errno;
-		mlx5_free(mem);
-		return NULL;
-	}
-	mkey_attr.addr = (uintptr_t)mem;
-	mkey_attr.size = size;
-	mkey_attr.umem_id = mlx5_os_get_umem_id(mem_mng->umem);
-	mkey_attr.pd = sh->pdn;
-	mkey_attr.log_entity_size = 0;
-	mkey_attr.pg_access = 0;
-	mkey_attr.klm_array = NULL;
-	mkey_attr.klm_num = 0;
-	if (priv->config.hca_attr.relaxed_ordering_write &&
-		priv->config.hca_attr.relaxed_ordering_read  &&
-		!haswell_broadwell_cpu)
-		mkey_attr.relaxed_ordering = 1;
-	mem_mng->dm = mlx5_devx_cmd_mkey_create(sh->ctx, &mkey_attr);
-	if (!mem_mng->dm) {
-		mlx5_glue->devx_umem_dereg(mem_mng->umem);
-		rte_errno = errno;
-		mlx5_free(mem);
-		return NULL;
-	}
-	mem_mng->raws = (struct mlx5_counter_stats_raw *)(mem + size);
-	raw_data = (volatile struct flow_counter_stats *)mem;
-	for (i = 0; i < raws_n; ++i) {
-		mem_mng->raws[i].mem_mng = mem_mng;
-		mem_mng->raws[i].data = raw_data + i * MLX5_COUNTERS_PER_POOL;
+		if (flow_dv_is_counter_in_pool(pool_tmp, id)) {
+			pool = pool_tmp;
+			break;
+		}
 	}
-	LIST_INSERT_HEAD(&sh->cmng.mem_mngs, mem_mng, next);
-	return mem_mng;
+out:
+	rte_spinlock_unlock(&cmng->pool_update_sl);
+	return pool;
 }
 
 /**
@@ -4767,7 +4698,6 @@ struct field_modify_info modify_tcp[] = {
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
-	struct mlx5_counter_stats_mem_mng *mem_mng = NULL;
 	void *old_pools = cmng->pools;
 	uint32_t resize = cmng->n + MLX5_CNT_CONTAINER_RESIZE;
 	uint32_t mem_size = sizeof(struct mlx5_flow_counter_pool *) * resize;
@@ -4780,30 +4710,8 @@ struct field_modify_info modify_tcp[] = {
 	if (old_pools)
 		memcpy(pools, old_pools, cmng->n *
 				       sizeof(struct mlx5_flow_counter_pool *));
-	/*
-	 * Fallback mode query the counter directly, no background query
-	 * resources are needed.
-	 */
-	if (!priv->counter_fallback) {
-		int i;
-
-		mem_mng = flow_dv_create_counter_stat_mem_mng(dev,
-			  MLX5_CNT_CONTAINER_RESIZE + MLX5_MAX_PENDING_QUERIES);
-		if (!mem_mng) {
-			mlx5_free(pools);
-			return -ENOMEM;
-		}
-		for (i = 0; i < MLX5_MAX_PENDING_QUERIES; ++i)
-			LIST_INSERT_HEAD(&priv->sh->cmng.free_stat_raws,
-					 mem_mng->raws +
-					 MLX5_CNT_CONTAINER_RESIZE +
-					 i, next);
-	}
-	rte_spinlock_lock(&cmng->resize_sl);
 	cmng->n = resize;
-	cmng->mem_mng = mem_mng;
 	cmng->pools = pools;
-	rte_spinlock_unlock(&cmng->resize_sl);
 	if (old_pools)
 		mlx5_free(old_pools);
 	return 0;
@@ -4842,9 +4750,14 @@ struct field_modify_info modify_tcp[] = {
 					0, pkts, bytes, 0, NULL, NULL, 0);
 	}
 	rte_spinlock_lock(&pool->sl);
-	offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
-	*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
-	*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	if (!pool->raw) {
+		*pkts = 0;
+		*bytes = 0;
+	} else {
+		offset = MLX5_CNT_ARRAY_IDX(pool, cnt);
+		*pkts = rte_be_to_cpu_64(pool->raw->data[offset].hits);
+		*bytes = rte_be_to_cpu_64(pool->raw->data[offset].bytes);
+	}
 	rte_spinlock_unlock(&pool->sl);
 	return 0;
 }
@@ -4871,12 +4784,9 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
-	int16_t n_valid = rte_atomic16_read(&cmng->n_valid);
 	uint32_t fallback = priv->counter_fallback;
 	uint32_t size = sizeof(*pool);
 
-	if (cmng->n == n_valid && flow_dv_container_resize(dev))
-		return NULL;
 	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
 	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
 	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
@@ -4885,24 +4795,26 @@ struct field_modify_info modify_tcp[] = {
 		rte_errno = ENOMEM;
 		return NULL;
 	}
-	if (!fallback) {
-		pool->min_dcs = dcs;
-		pool->raw = cmng->mem_mng->raws + n_valid %
-						      MLX5_CNT_CONTAINER_RESIZE;
-	}
-	pool->raw_hw = NULL;
+	pool->raw = NULL;
 	pool->type = 0;
-	pool->type |= (!fallback ? 0 :  CNT_POOL_TYPE_EXT);
 	pool->type |= (!age ? 0 :  CNT_POOL_TYPE_AGE);
 	pool->query_gen = 0;
+	pool->min_dcs = dcs;
 	rte_spinlock_init(&pool->sl);
+	rte_spinlock_init(&pool->csl);
 	TAILQ_INIT(&pool->counters[0]);
 	TAILQ_INIT(&pool->counters[1]);
-	TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
-	pool->index = n_valid;
 	pool->time_of_last_age_check = MLX5_CURR_TIME_SEC;
-	cmng->pools[n_valid] = pool;
-	if (fallback) {
+	rte_spinlock_lock(&cmng->pool_update_sl);
+	pool->index = cmng->n_valid;
+	if (pool->index == cmng->n && flow_dv_container_resize(dev)) {
+		mlx5_free(pool);
+		rte_spinlock_unlock(&cmng->pool_update_sl);
+		return NULL;
+	}
+	cmng->pools[pool->index] = pool;
+	cmng->n_valid++;
+	if (unlikely(fallback)) {
 		int base = RTE_ALIGN_FLOOR(dcs->id, MLX5_COUNTERS_PER_POOL);
 
 		if (base < cmng->min_id)
@@ -4910,10 +4822,9 @@ struct field_modify_info modify_tcp[] = {
 		if (base > cmng->max_id)
 			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
 		cmng->last_pool_idx = pool->index;
+		pool->type |= CNT_POOL_TYPE_EXT;
 	}
-	/* Pool initialization must be updated before host thread access. */
-	rte_io_wmb();
-	rte_atomic16_add(&cmng->n_valid, 1);
+	rte_spinlock_unlock(&cmng->pool_update_sl);
 	return pool;
 }
 
@@ -5219,12 +5130,16 @@ struct field_modify_info modify_tcp[] = {
 	 *
 	 */
 	if (!priv->counter_fallback) {
+		rte_spinlock_lock(&pool->csl);
 		TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen], cnt, next);
+		rte_spinlock_unlock(&pool->csl);
 	} else {
 		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
 					       MLX5_COUNTER_TYPE_ORIGIN;
+		rte_spinlock_lock(&priv->sh->cmng.csl[cnt_type]);
 		TAILQ_INSERT_TAIL(&priv->sh->cmng.counters[cnt_type],
 				  cnt, next);
+		rte_spinlock_unlock(&priv->sh->cmng.csl[cnt_type]);
 	}
 }
 
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index bd2a734..5df2209 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -259,7 +259,7 @@
 	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
 	union mlx5_l3t_data data;
-	uint32_t n_valid = rte_atomic16_read(&cmng->n_valid);
+	uint32_t n_valid = cmng->n_valid;
 	uint32_t pool_idx, cnt_idx;
 	uint32_t i;
 	int ret;
@@ -317,8 +317,7 @@
 		cnt = MLX5_POOL_GET_CNT(pool, 0);
 		cmng->pools[n_valid] = pool;
 		pool_idx = n_valid;
-		rte_atomic16_add(&cmng->n_valid, 1);
-		TAILQ_INSERT_HEAD(&cmng->pool_list, pool, next);
+		cmng->n_valid++;
 	}
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 5/8] net/mlx5: make three level table thread safe
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (3 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: synchronize flow counter pool creation Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: make shared counters " Suanming Mou
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

This commit adds thread safety support to the three-level table using
a spinlock and a reference counter for each table entry.

A new mlx5_l3t_prepare_entry() function is added in order to support
multiple-thread operation.
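
A minimal usage sketch of the new API (alloc_cb(), create_resource()
and use_resource() are hypothetical names, shown only to illustrate
the calling convention):

	static int32_t
	alloc_cb(void *ctx, union mlx5_l3t_data *data)
	{
		/* Invoked under the table lock, only for empty entries. */
		data->dword = create_resource(ctx);
		return data->dword ? 0 : -1;
	}

	union mlx5_l3t_data data = { .dword = 0 };

	if (!mlx5_l3t_prepare_entry(tbl, idx, &data, alloc_cb, ctx))
		use_resource(data.dword); /* One reference is now held. */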

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/mlx5_utils.c | 191 ++++++++++++++++++++++++++++++++++--------
 drivers/net/mlx5/mlx5_utils.h |  81 ++++++++++++++----
 2 files changed, 224 insertions(+), 48 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_utils.c b/drivers/net/mlx5/mlx5_utils.c
index fefe833..9a54fda 100644
--- a/drivers/net/mlx5/mlx5_utils.c
+++ b/drivers/net/mlx5/mlx5_utils.c
@@ -551,26 +551,23 @@ struct mlx5_l3t_tbl *
 	tbl->type = type;
 	switch (type) {
 	case MLX5_L3T_TYPE_WORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_word) +
-				  sizeof(uint16_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_word);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_w";
 		break;
 	case MLX5_L3T_TYPE_DWORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_dword) +
-				  sizeof(uint32_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_dword);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_dw";
 		break;
 	case MLX5_L3T_TYPE_QWORD:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_qword) +
-				  sizeof(uint64_t) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_qword);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_qw";
 		break;
 	default:
-		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_ptr) +
-				  sizeof(void *) * MLX5_L3T_ET_SIZE;
+		l3t_ip_cfg.size = sizeof(struct mlx5_l3t_entry_ptr);
 		l3t_ip_cfg.type = "mlx5_l3t_e_tbl_tpr";
 		break;
 	}
+	rte_spinlock_init(&tbl->sl);
 	tbl->eip = mlx5_ipool_create(&l3t_ip_cfg);
 	if (!tbl->eip) {
 		rte_errno = ENOMEM;
@@ -620,11 +617,15 @@ struct mlx5_l3t_tbl *
 	mlx5_free(tbl);
 }
 
-uint32_t
-mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
-		   union mlx5_l3t_data *data)
+static int32_t
+__l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+		union mlx5_l3t_data *data)
 {
 	struct mlx5_l3t_level_tbl *g_tbl, *m_tbl;
+	struct mlx5_l3t_entry_word *w_e_tbl;
+	struct mlx5_l3t_entry_dword *dw_e_tbl;
+	struct mlx5_l3t_entry_qword *qw_e_tbl;
+	struct mlx5_l3t_entry_ptr *ptr_e_tbl;
 	void *e_tbl;
 	uint32_t entry_idx;
 
@@ -640,26 +641,46 @@ struct mlx5_l3t_tbl *
 	entry_idx = idx & MLX5_L3T_ET_MASK;
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
-		data->word = ((struct mlx5_l3t_entry_word *)e_tbl)->entry
-			     [entry_idx];
+		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
+		data->word = w_e_tbl->entry[entry_idx].data;
+		if (w_e_tbl->entry[entry_idx].data)
+			w_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
-		data->dword = ((struct mlx5_l3t_entry_dword *)e_tbl)->entry
-			     [entry_idx];
+		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
+		data->dword = dw_e_tbl->entry[entry_idx].data;
+		if (dw_e_tbl->entry[entry_idx].data)
+			dw_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
-		data->qword = ((struct mlx5_l3t_entry_qword *)e_tbl)->entry
-			      [entry_idx];
+		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
+		data->qword = qw_e_tbl->entry[entry_idx].data;
+		if (qw_e_tbl->entry[entry_idx].data)
+			qw_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	default:
-		data->ptr = ((struct mlx5_l3t_entry_ptr *)e_tbl)->entry
-			    [entry_idx];
+		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
+		data->ptr = ptr_e_tbl->entry[entry_idx].data;
+		if (ptr_e_tbl->entry[entry_idx].data)
+			ptr_e_tbl->entry[entry_idx].ref_cnt++;
 		break;
 	}
 	return 0;
 }
 
-void
+int32_t
+mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+		   union mlx5_l3t_data *data)
+{
+	int ret;
+
+	rte_spinlock_lock(&tbl->sl);
+	ret = __l3t_get_entry(tbl, idx, data);
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
+}
+
+int32_t
 mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx)
 {
 	struct mlx5_l3t_level_tbl *g_tbl, *m_tbl;
@@ -670,36 +691,54 @@ struct mlx5_l3t_tbl *
 	void *e_tbl;
 	uint32_t entry_idx;
 	uint64_t ref_cnt;
+	int32_t ret = -1;
 
+	rte_spinlock_lock(&tbl->sl);
 	g_tbl = tbl->tbl;
 	if (!g_tbl)
-		return;
+		goto out;
 	m_tbl = g_tbl->tbl[(idx >> MLX5_L3T_GT_OFFSET) & MLX5_L3T_GT_MASK];
 	if (!m_tbl)
-		return;
+		goto out;
 	e_tbl = m_tbl->tbl[(idx >> MLX5_L3T_MT_OFFSET) & MLX5_L3T_MT_MASK];
 	if (!e_tbl)
-		return;
+		goto out;
 	entry_idx = idx & MLX5_L3T_ET_MASK;
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
 		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
-		w_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(w_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --w_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		w_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --w_e_tbl->ref_cnt;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
 		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
-		dw_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(dw_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --dw_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		dw_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --dw_e_tbl->ref_cnt;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
 		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
-		qw_e_tbl->entry[entry_idx] = 0;
+		MLX5_ASSERT(qw_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --qw_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		qw_e_tbl->entry[entry_idx].data = 0;
 		ref_cnt = --qw_e_tbl->ref_cnt;
 		break;
 	default:
 		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
-		ptr_e_tbl->entry[entry_idx] = NULL;
+		MLX5_ASSERT(ptr_e_tbl->entry[entry_idx].ref_cnt);
+		ret = --ptr_e_tbl->entry[entry_idx].ref_cnt;
+		if (ret)
+			goto out;
+		ptr_e_tbl->entry[entry_idx].data = NULL;
 		ref_cnt = --ptr_e_tbl->ref_cnt;
 		break;
 	}
@@ -718,11 +757,14 @@ struct mlx5_l3t_tbl *
 			}
 		}
 	}
+out:
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
 }
 
-uint32_t
-mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
-		   union mlx5_l3t_data *data)
+static int32_t
+__l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+		union mlx5_l3t_data *data)
 {
 	struct mlx5_l3t_level_tbl *g_tbl, *m_tbl;
 	struct mlx5_l3t_entry_word *w_e_tbl;
@@ -783,24 +825,105 @@ struct mlx5_l3t_tbl *
 	switch (tbl->type) {
 	case MLX5_L3T_TYPE_WORD:
 		w_e_tbl = (struct mlx5_l3t_entry_word *)e_tbl;
-		w_e_tbl->entry[entry_idx] = data->word;
+		if (w_e_tbl->entry[entry_idx].data) {
+			data->word = w_e_tbl->entry[entry_idx].data;
+			w_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			return -1;
+		}
+		w_e_tbl->entry[entry_idx].data = data->word;
+		w_e_tbl->entry[entry_idx].ref_cnt = 1;
 		w_e_tbl->ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_DWORD:
 		dw_e_tbl = (struct mlx5_l3t_entry_dword *)e_tbl;
-		dw_e_tbl->entry[entry_idx] = data->dword;
+		if (dw_e_tbl->entry[entry_idx].data) {
+			data->dword = dw_e_tbl->entry[entry_idx].data;
+			dw_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			return -1;
+		}
+		dw_e_tbl->entry[entry_idx].data = data->dword;
+		dw_e_tbl->entry[entry_idx].ref_cnt = 1;
 		dw_e_tbl->ref_cnt++;
 		break;
 	case MLX5_L3T_TYPE_QWORD:
 		qw_e_tbl = (struct mlx5_l3t_entry_qword *)e_tbl;
-		qw_e_tbl->entry[entry_idx] = data->qword;
+		if (qw_e_tbl->entry[entry_idx].data) {
+			data->qword = qw_e_tbl->entry[entry_idx].data;
+			qw_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			return -1;
+		}
+		qw_e_tbl->entry[entry_idx].data = data->qword;
+		qw_e_tbl->entry[entry_idx].ref_cnt = 1;
 		qw_e_tbl->ref_cnt++;
 		break;
 	default:
 		ptr_e_tbl = (struct mlx5_l3t_entry_ptr *)e_tbl;
-		ptr_e_tbl->entry[entry_idx] = data->ptr;
+		if (ptr_e_tbl->entry[entry_idx].data) {
+			data->ptr = ptr_e_tbl->entry[entry_idx].data;
+			ptr_e_tbl->entry[entry_idx].ref_cnt++;
+			rte_errno = EEXIST;
+			return -1;
+		}
+		ptr_e_tbl->entry[entry_idx].data = data->ptr;
+		ptr_e_tbl->entry[entry_idx].ref_cnt = 1;
 		ptr_e_tbl->ref_cnt++;
 		break;
 	}
 	return 0;
 }
+
+int32_t
+mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+		   union mlx5_l3t_data *data)
+{
+	int ret;
+
+	rte_spinlock_lock(&tbl->sl);
+	ret = __l3t_set_entry(tbl, idx, data);
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
+}
+
+int32_t
+mlx5_l3t_prepare_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+		       union mlx5_l3t_data *data,
+		       mlx5_l3t_alloc_callback_fn cb, void *ctx)
+{
+	int32_t ret;
+
+	rte_spinlock_lock(&tbl->sl);
+	/* Check if entry data is ready. */
+	ret = __l3t_get_entry(tbl, idx, data);
+	if (!ret) {
+		switch (tbl->type) {
+		case MLX5_L3T_TYPE_WORD:
+			if (data->word)
+				goto out;
+			break;
+		case MLX5_L3T_TYPE_DWORD:
+			if (data->dword)
+				goto out;
+			break;
+		case MLX5_L3T_TYPE_QWORD:
+			if (data->qword)
+				goto out;
+			break;
+		default:
+			if (data->ptr)
+				goto out;
+			break;
+		}
+	}
+	/* Entry data is not ready, use user callback to create it. */
+	ret = cb(ctx, data);
+	if (ret)
+		goto out;
+	/* Save the newly allocated data to the entry. */
+	ret = __l3t_set_entry(tbl, idx, data);
+out:
+	rte_spinlock_unlock(&tbl->sl);
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h
index f078bdc..ca9bb76 100644
--- a/drivers/net/mlx5/mlx5_utils.h
+++ b/drivers/net/mlx5/mlx5_utils.h
@@ -118,29 +118,41 @@ struct mlx5_l3t_level_tbl {
 struct mlx5_l3t_entry_word {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint16_t entry[]; /* Entry array. */
-};
+	struct {
+		uint16_t data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 double word entry table data structure. */
 struct mlx5_l3t_entry_dword {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint32_t entry[]; /* Entry array. */
-};
+	struct {
+		uint32_t data;
+		int32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 quad word entry table data structure. */
 struct mlx5_l3t_entry_qword {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	uint64_t entry[]; /* Entry array. */
-};
+	struct {
+		uint64_t data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 pointer entry table data structure. */
 struct mlx5_l3t_entry_ptr {
 	uint32_t idx; /* Table index. */
 	uint64_t ref_cnt; /* Table ref_cnt. */
-	void *entry[]; /* Entry array. */
-};
+	struct {
+		void *data;
+		uint32_t ref_cnt;
+	} entry[MLX5_L3T_ET_SIZE]; /* Entry array */
+} __rte_packed;
 
 /* L3 table data structure. */
 struct mlx5_l3t_tbl {
@@ -148,8 +160,13 @@ struct mlx5_l3t_tbl {
 	struct mlx5_indexed_pool *eip;
 	/* Table index pool handles. */
 	struct mlx5_l3t_level_tbl *tbl; /* Global table index. */
+	rte_spinlock_t sl; /* The table lock. */
 };
 
+/** Type of function used to allocate missing entry data. */
+typedef int32_t (*mlx5_l3t_alloc_callback_fn)(void *ctx,
+					   union mlx5_l3t_data *data);
+
 /*
  * The indexed memory entry index is made up of trunk index and offset of
  * the entry in the trunk. Since the entry index is 32 bits, in case user
@@ -535,32 +552,68 @@ struct mlx5_indexed_pool *
  *   0 if success, -1 on error.
  */
 
-uint32_t mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+int32_t mlx5_l3t_get_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 			    union mlx5_l3t_data *data);
+
 /**
- * This function clears the index entry from Three-level table.
+ * This function gets the index entry from Three-level table.
+ *
+ * If the index entry is not available, allocate a new one via the
+ * callback function and fill in the entry.
  *
  * @param tbl
  *   Pointer to the l3t.
  * @param idx
  *   Index to the entry.
+ * @param data
+ *   Pointer to the memory which saves the entry data.
+ *   When the function returns 0, data contains the entry data read
+ *   from the l3t.
+ *   When the function returns -1, data is not modified.
+ * @param cb
+ *   Callback function to allocate new data.
+ * @param ctx
+ *   Context for callback function.
+ *
+ * @return
+ *   0 if success, -1 on error.
  */
-void mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx);
+
+int32_t mlx5_l3t_prepare_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+			       union mlx5_l3t_data *data,
+			       mlx5_l3t_alloc_callback_fn cb, void *ctx);
 
 /**
- * This function gets the index entry from Three-level table.
+ * This function decreases the entry reference counter and clears the
+ * index entry from the Three-level table when the counter reaches 0.
  *
  * @param tbl
  *   Pointer to the l3t.
  * @param idx
  *   Index to the entry.
- * @param data
+ *
+ * @return
+ *   The remaining reference count; 0 means the entry was cleared, -1 on error.
+ */
+int32_t mlx5_l3t_clear_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx);
+
+/**
+ * This function sets the index entry in the Three-level table.
+ * If the entry is already set, rte_errno is set to EEXIST and the
+ * existing data is returned through the data pointer.
+ *
+ * @param tbl[in]
+ *   Pointer to the l3t.
+ * @param idx[in]
+ *   Index to the entry.
+ * @param data[in/out]
  *   Pointer to the memory which contains the entry data save to l3t.
+ *   If the entry is already set, the existing data is returned.
  *
  * @return
  *   0 if success, -1 on error.
  */
-uint32_t mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
+int32_t mlx5_l3t_set_entry(struct mlx5_l3t_tbl *tbl, uint32_t idx,
 			    union mlx5_l3t_data *data);
 
 /*
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 6/8] net/mlx5: make shared counters thread safe
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (4 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: make three level table thread safe Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: rename flow counter macro Suanming Mou
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

The shared counters save the counter index to the three-level table.
As the three-level table now supports multiple-thread operations, the
shared counters can take advantage of it to become thread safe.
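
Conceptually, the shared counter life cycle now maps onto the table
entry reference counting; a simplified sketch of the pattern used in
this patch (not the verbatim driver code):

	/* Get: allocates on first use, otherwise takes a reference. */
	cnt_idx = flow_dv_counter_get_shared(dev, id);

	/* Release: mlx5_l3t_clear_entry() drops one reference and
	 * returns the remaining count; the counter is freed only when
	 * it returns 0 (the last reference is gone).
	 */
	if (IS_SHARED_CNT(counter) &&
	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
		return;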

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/mlx5.h            |   7 ++-
 drivers/net/mlx5/mlx5_flow_dv.c    | 124 +++++++++++++++++++------------------
 drivers/net/mlx5/mlx5_flow_verbs.c |  19 ++----
 3 files changed, 73 insertions(+), 77 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index e314668..2598fa2 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -348,10 +348,15 @@ struct flow_counter_stats {
 
 /* Shared counters information for counters. */
 struct mlx5_flow_counter_shared {
-	uint32_t ref_cnt; /**< Reference counter. */
 	uint32_t id; /**< User counter ID. */
 };
 
+/* Shared counter configuration. */
+struct mlx5_shared_counter_conf {
+	struct rte_eth_dev *dev; /* The device shared counter belongs to. */
+	uint32_t id; /* The shared counter ID. */
+};
+
 struct mlx5_flow_counter_pool;
 /* Generic counters information. */
 struct mlx5_flow_counter {
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 6226d87..067ef0f 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4903,36 +4903,10 @@ struct field_modify_info modify_tcp[] = {
 }
 
 /**
- * Search for existed shared counter.
- *
- * @param[in] dev
- *   Pointer to the Ethernet device structure.
- * @param[in] id
- *   The shared counter ID to search.
- *
- * @return
- *   0 if not existed, otherwise shared counter index.
- */
-static uint32_t
-flow_dv_counter_shared_search(struct rte_eth_dev *dev, uint32_t id)
-{
-	struct mlx5_priv *priv = dev->data->dev_private;
-	union mlx5_l3t_data data;
-
-	if (mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data))
-		return 0;
-	return data.dword;
-}
-
-/**
  * Allocate a flow counter.
  *
  * @param[in] dev
  *   Pointer to the Ethernet device structure.
- * @param[in] shared
- *   Indicate if this counter is shared with other flows.
- * @param[in] id
- *   Counter identifier.
  * @param[in] age
  *   Whether the counter was allocated for aging.
  *
@@ -4940,8 +4914,7 @@ struct field_modify_info modify_tcp[] = {
  *   Index to flow counter on success, 0 otherwise and rte_errno is set.
  */
 static uint32_t
-flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id,
-		      uint32_t age)
+flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t age)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
@@ -4957,19 +4930,6 @@ struct field_modify_info modify_tcp[] = {
 		rte_errno = ENOTSUP;
 		return 0;
 	}
-	if (shared) {
-		cnt_idx = flow_dv_counter_shared_search(dev, id);
-		if (cnt_idx) {
-			cnt_free = flow_dv_counter_get_by_idx(dev, cnt_idx,
-							      NULL);
-			if (cnt_free->shared_info.ref_cnt + 1 == 0) {
-				rte_errno = E2BIG;
-				return 0;
-			}
-			cnt_free->shared_info.ref_cnt++;
-			return cnt_idx;
-		}
-	}
 	/* Get free counters from container. */
 	rte_spinlock_lock(&cmng->csl[cnt_type]);
 	cnt_free = TAILQ_FIRST(&cmng->counters[cnt_type]);
@@ -5007,16 +4967,6 @@ struct field_modify_info modify_tcp[] = {
 	if (_flow_dv_query_count(dev, cnt_idx, &cnt_free->hits,
 				 &cnt_free->bytes))
 		goto err;
-	if (shared) {
-		union mlx5_l3t_data data;
-
-		data.dword = cnt_idx;
-		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
-			goto err;
-		cnt_free->shared_info.ref_cnt = 1;
-		cnt_free->shared_info.id = id;
-		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
-	}
 	if (!fallback && !priv->sh->cmng.query_thread_on)
 		/* Start the asynchronous batch query by the host thread. */
 		mlx5_set_query_alarm(priv->sh);
@@ -5032,6 +4982,60 @@ struct field_modify_info modify_tcp[] = {
 }
 
 /**
+ * Allocate a shared flow counter.
+ *
+ * @param[in] ctx
+ *   Pointer to the shared counter configuration.
+ * @param[out] data
+ *   Pointer to save the allocated counter index.
+ *
+ * @return
+ *   0 on success; the allocated counter index is returned through data.
+ */
+
+static int32_t
+flow_dv_counter_alloc_shared_cb(void *ctx, union mlx5_l3t_data *data)
+{
+	struct mlx5_shared_counter_conf *conf = ctx;
+	struct rte_eth_dev *dev = conf->dev;
+	struct mlx5_flow_counter *cnt;
+
+	data->dword = flow_dv_counter_alloc(dev, 0);
+	data->dword |= MLX5_CNT_SHARED_OFFSET;
+	cnt = flow_dv_counter_get_by_idx(dev, data->dword, NULL);
+	cnt->shared_info.id = conf->id;
+	return 0;
+}
+
+/**
+ * Get a shared flow counter.
+ *
+ * @param[in] dev
+ *   Pointer to the Ethernet device structure.
+ * @param[in] id
+ *   Counter identifier.
+ *
+ * @return
+ *   Index to flow counter on success, 0 otherwise and rte_errno is set.
+ */
+static uint32_t
+flow_dv_counter_get_shared(struct rte_eth_dev *dev, uint32_t id)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_shared_counter_conf conf = {
+		.dev = dev,
+		.id = id,
+	};
+	union mlx5_l3t_data data = {
+		.dword = 0,
+	};
+
+	mlx5_l3t_prepare_entry(priv->sh->cnt_id_tbl, id, &data,
+			       flow_dv_counter_alloc_shared_cb, &conf);
+	return data.dword;
+}
+
+/**
  * Get age param from counter index.
  *
  * @param[in] dev
@@ -5110,12 +5114,9 @@ struct field_modify_info modify_tcp[] = {
 		return;
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (IS_SHARED_CNT(counter)) {
-		if (--cnt->shared_info.ref_cnt)
-			return;
-		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-				     cnt->shared_info.id);
-	}
+	if (IS_SHARED_CNT(counter) &&
+	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
+		return;
 	if (IS_AGE_POOL(pool))
 		flow_dv_counter_remove_from_age(dev, counter, cnt);
 	cnt->pool = pool;
@@ -8271,9 +8272,10 @@ struct field_modify_info modify_tcp[] = {
 	uint32_t counter;
 	struct mlx5_age_param *age_param;
 
-	counter = flow_dv_counter_alloc(dev,
-				count ? count->shared : 0,
-				count ? count->id : 0, !!age);
+	if (count && count->shared)
+		counter = flow_dv_counter_get_shared(dev, count->id);
+	else
+		counter = flow_dv_counter_alloc(dev, !!age);
 	if (!counter || age == NULL)
 		return counter;
 	age_param  = flow_dv_counter_idx_get_age(dev, counter);
@@ -11442,7 +11444,7 @@ struct field_modify_info modify_tcp[] = {
 	uint32_t cnt;
 
 	flow_dv_shared_lock(dev);
-	cnt = flow_dv_counter_alloc(dev, 0, 0, 0);
+	cnt = flow_dv_counter_alloc(dev, 0);
 	flow_dv_shared_unlock(dev);
 	return cnt;
 }
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 5df2209..1fd5972 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -265,15 +265,8 @@
 	int ret;
 
 	if (shared && !mlx5_l3t_get_entry(priv->sh->cnt_id_tbl, id, &data) &&
-	    data.dword) {
-		cnt = flow_verbs_counter_get_by_idx(dev, data.dword, NULL);
-		if (cnt->shared_info.ref_cnt + 1 == 0) {
-			rte_errno = E2BIG;
-			return 0;
-		}
-		cnt->shared_info.ref_cnt++;
+	    data.dword)
 		return data.dword;
-	}
 	for (pool_idx = 0; pool_idx < n_valid; ++pool_idx) {
 		pool = cmng->pools[pool_idx];
 		if (!pool)
@@ -325,7 +318,6 @@
 		data.dword = cnt_idx;
 		if (mlx5_l3t_set_entry(priv->sh->cnt_id_tbl, id, &data))
 			return 0;
-		cnt->shared_info.ref_cnt = 1;
 		cnt->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
@@ -360,12 +352,9 @@
 	struct mlx5_flow_counter_ext *cnt_ext;
 
 	cnt = flow_verbs_counter_get_by_idx(dev, counter, &pool);
-	if (IS_SHARED_CNT(counter)) {
-		if (--cnt->shared_info.ref_cnt)
-			return;
-		mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl,
-				     cnt->shared_info.id);
-	}
+	if (IS_SHARED_CNT(counter) &&
+	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
+		return;
 	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 	claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 7/8] net/mlx5: rename flow counter macro
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (5 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: make shared counters " Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: optimize counter extend memory Suanming Mou
  2020-10-20 22:59   ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe Raslan Darawsheh
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

Add the MLX5_ prefix to the defined counter macro names.

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/mlx5.h            | 22 +++++++++++-----------
 drivers/net/mlx5/mlx5_flow_dv.c    | 10 +++++-----
 drivers/net/mlx5/mlx5_flow_verbs.c |  2 +-
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2598fa2..9638ab2 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -276,19 +276,19 @@ struct mlx5_drop {
 #define IS_SHARED_CNT(cnt) (!!((cnt) & MLX5_CNT_SHARED_OFFSET))
 #define IS_BATCH_CNT(cnt) (((cnt) & (MLX5_CNT_SHARED_OFFSET - 1)) >= \
 			   MLX5_CNT_BATCH_OFFSET)
-#define CNT_SIZE (sizeof(struct mlx5_flow_counter))
-#define CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
-#define AGE_SIZE (sizeof(struct mlx5_age_param))
-#define CNT_POOL_TYPE_EXT	(1 << 0)
-#define CNT_POOL_TYPE_AGE	(1 << 1)
+#define MLX5_CNT_SIZE (sizeof(struct mlx5_flow_counter))
+#define MLX5_CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
+#define MLX5_AGE_SIZE (sizeof(struct mlx5_age_param))
+#define MLX5_CNT_POOL_TYPE_EXT (1 << 0)
+#define MLX5_CNT_POOL_TYPE_AGE (1 << 1)
 
-#define IS_EXT_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_EXT)
-#define IS_AGE_POOL(pool) (((pool)->type) & CNT_POOL_TYPE_AGE)
+#define IS_EXT_POOL(pool) (((pool)->type) & MLX5_CNT_POOL_TYPE_EXT)
+#define IS_AGE_POOL(pool) (((pool)->type) & MLX5_CNT_POOL_TYPE_AGE)
 
 #define MLX5_CNT_LEN(pool) \
-	(CNT_SIZE + \
-	(IS_AGE_POOL(pool) ? AGE_SIZE : 0) + \
-	(IS_EXT_POOL(pool) ? CNTEXT_SIZE : 0))
+	(MLX5_CNT_SIZE + \
+	(IS_AGE_POOL(pool) ? MLX5_AGE_SIZE : 0) + \
+	(IS_EXT_POOL(pool) ? MLX5_CNTEXT_SIZE : 0))
 #define MLX5_POOL_GET_CNT(pool, index) \
 	((struct mlx5_flow_counter *) \
 	((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool))))
@@ -306,7 +306,7 @@ struct mlx5_drop {
 #define MLX5_CNT_TO_CNT_EXT(pool, cnt) \
 	((struct mlx5_flow_counter_ext *)\
 	((uint8_t *)((cnt) + 1) + \
-	(IS_AGE_POOL(pool) ? AGE_SIZE : 0)))
+	(IS_AGE_POOL(pool) ? MLX5_AGE_SIZE : 0)))
 #define MLX5_GET_POOL_CNT_EXT(pool, offset) \
 	MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset)))
 #define MLX5_CNT_TO_AGE(cnt) \
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 067ef0f..d302a83 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4787,9 +4787,9 @@ struct field_modify_info modify_tcp[] = {
 	uint32_t fallback = priv->counter_fallback;
 	uint32_t size = sizeof(*pool);
 
-	size += MLX5_COUNTERS_PER_POOL * CNT_SIZE;
-	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * CNTEXT_SIZE);
-	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * AGE_SIZE);
+	size += MLX5_COUNTERS_PER_POOL * MLX5_CNT_SIZE;
+	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * MLX5_CNTEXT_SIZE);
+	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * MLX5_AGE_SIZE);
 	pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 	if (!pool) {
 		rte_errno = ENOMEM;
@@ -4797,7 +4797,7 @@ struct field_modify_info modify_tcp[] = {
 	}
 	pool->raw = NULL;
 	pool->type = 0;
-	pool->type |= (!age ? 0 :  CNT_POOL_TYPE_AGE);
+	pool->type |= (!age ? 0 :  MLX5_CNT_POOL_TYPE_AGE);
 	pool->query_gen = 0;
 	pool->min_dcs = dcs;
 	rte_spinlock_init(&pool->sl);
@@ -4822,7 +4822,7 @@ struct field_modify_info modify_tcp[] = {
 		if (base > cmng->max_id)
 			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
 		cmng->last_pool_idx = pool->index;
-		pool->type |= CNT_POOL_TYPE_EXT;
+		pool->type |= MLX5_CNT_POOL_TYPE_EXT;
 	}
 	rte_spinlock_unlock(&cmng->pool_update_sl);
 	return pool;
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 1fd5972..0bb17b5 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -302,7 +302,7 @@
 		pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 		if (!pool)
 			return 0;
-		pool->type |= CNT_POOL_TYPE_EXT;
+		pool->type |= MLX5_CNT_POOL_TYPE_EXT;
 		for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
 			cnt = MLX5_POOL_GET_CNT(pool, i);
 			TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [dpdk-dev] [PATCH v2 8/8] net/mlx5: optimize counter extend memory
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (6 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: rename flow counter macro Suanming Mou
@ 2020-10-20  3:02   ` Suanming Mou
  2020-10-20 22:59   ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe Raslan Darawsheh
  8 siblings, 0 replies; 17+ messages in thread
From: Suanming Mou @ 2020-10-20  3:02 UTC (permalink / raw)
  To: Matan Azrad, Shahaf Shuler, Viacheslav Ovsiienko; +Cc: dev

Counter extend memory was allocated for non-batch counters to save the
extra DevX object. Currently, for a non-batch counter which does not
support aging, the TAILQ entry in the generic counter struct is used
only while the counter sits in the free list, and the bytes field is
used only while the counter is allocated and in use.

The DevX object can therefore be saved in a union with the entry
memory while the counter is allocated, and in a union with the bytes
field while the counter is free.
The pool type field is also no longer needed: the non-fallback mode
has only generic and aging counters, so a single bit indicating
whether the pool is aged is enough.

This eliminates the counter extend info struct and saves the memory.
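
Schematically, the memory reuse looks as follows (a simplified sketch
with unrelated fields omitted; dcs_when_active is the assumed name of
the allocated-time field):

	struct mlx5_flow_counter {
		union {
			/* Linkage, used only while in the free list. */
			TAILQ_ENTRY(mlx5_flow_counter) next;
			/* DevX object while the counter is allocated. */
			struct mlx5_devx_obj *dcs_when_active;
		};
		/* ... */
		union {
			/* Byte count, used only while allocated. */
			uint64_t bytes;
			/* DevX object while the counter is free. */
			struct mlx5_devx_obj *dcs_when_free;
		};
	};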

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c   | 22 +++++++++-----
 drivers/net/mlx5/mlx5.c            | 18 ++++++-----
 drivers/net/mlx5/mlx5.h            | 60 ++++++++++++++++++------------------
 drivers/net/mlx5/mlx5_flow.c       |  6 ++--
 drivers/net/mlx5/mlx5_flow_dv.c    | 38 +++++++++++------------
 drivers/net/mlx5/mlx5_flow_verbs.c | 62 +++++++++++++++++---------------------
 6 files changed, 100 insertions(+), 106 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 6e33b2b..457008e 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -573,24 +573,30 @@
 {
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	bool fallback;
 
-	/* If devx is not supported or not DV mode, counters are not working. */
-	if (!priv->config.devx || !priv->config.dv_flow_en)
-		return;
 #ifndef HAVE_IBV_DEVX_ASYNC
-	priv->counter_fallback = 1;
+	fallback = true;
 #else
-	priv->counter_fallback = 0;
-	if (!priv->config.hca_attr.flow_counters_dump ||
+	fallback = false;
+	if (!priv->config.devx || !priv->config.dv_flow_en ||
+	    !priv->config.hca_attr.flow_counters_dump ||
 	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
 	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
-		priv->counter_fallback = 1;
+		fallback = true;
 #endif
-	if (priv->counter_fallback)
+	if (fallback)
 		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
 			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
 			priv->config.hca_attr.flow_counters_dump,
 			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
+	/* Initialize fallback mode only on the port that initializes sh. */
+	if (sh->refcnt == 1)
+		sh->cmng.counter_fallback = fallback;
+	else if (fallback != sh->cmng.counter_fallback)
+		DRV_LOG(WARNING, "Port %d in sh has a different fallback mode "
+			"from other ports: %d.", PORT_ID(priv), fallback);
 #endif
 }
 
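The fallback decision is stored once in the shared context and later
ports only validate it. A hedged sketch of this first-initializer
pattern, with illustrative names rather than the driver's real API:

    #include <stdbool.h>
    #include <stdio.h>

    struct shared_ctx {
            unsigned int refcnt;      /* ports attached to this sh */
            bool counter_fallback;
    };

    static void
    set_fallback(struct shared_ctx *sh, bool fallback)
    {
            if (sh->refcnt == 1)      /* first attached port decides */
                    sh->counter_fallback = fallback;
            else if (fallback != sh->counter_fallback)
                    fprintf(stderr, "fallback mode mismatch\n");
    }
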
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e805723..e4ce9a9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -535,23 +535,25 @@ struct mlx5_flow_id_pool *
 	if (sh->cmng.pools) {
 		struct mlx5_flow_counter_pool *pool;
 		uint16_t n_valid = sh->cmng.n_valid;
+		bool fallback = sh->cmng.counter_fallback;
 
 		for (i = 0; i < n_valid; ++i) {
 			pool = sh->cmng.pools[i];
-			if (!IS_EXT_POOL(pool) && pool->min_dcs)
+			if (!fallback && pool->min_dcs)
 				claim_zero(mlx5_devx_cmd_destroy
 							       (pool->min_dcs));
 			for (j = 0; j < MLX5_COUNTERS_PER_POOL; ++j) {
-				if (MLX5_POOL_GET_CNT(pool, j)->action)
+				struct mlx5_flow_counter *cnt =
+						MLX5_POOL_GET_CNT(pool, j);
+
+				if (cnt->action)
 					claim_zero
 					 (mlx5_glue->destroy_flow_action
-					  (MLX5_POOL_GET_CNT
-					  (pool, j)->action));
-				if (IS_EXT_POOL(pool) && MLX5_GET_POOL_CNT_EXT
-				    (pool, j)->dcs)
+					  (cnt->action));
+				if (fallback && MLX5_POOL_GET_CNT
+				    (pool, j)->dcs_when_free)
 					claim_zero(mlx5_devx_cmd_destroy
-						   (MLX5_GET_POOL_CNT_EXT
-						    (pool, j)->dcs));
+						   (cnt->dcs_when_free));
 			}
 			mlx5_free(pool);
 		}
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 9638ab2..fa69c66 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -277,18 +277,11 @@ struct mlx5_drop {
 #define IS_BATCH_CNT(cnt) (((cnt) & (MLX5_CNT_SHARED_OFFSET - 1)) >= \
 			   MLX5_CNT_BATCH_OFFSET)
 #define MLX5_CNT_SIZE (sizeof(struct mlx5_flow_counter))
-#define MLX5_CNTEXT_SIZE (sizeof(struct mlx5_flow_counter_ext))
 #define MLX5_AGE_SIZE (sizeof(struct mlx5_age_param))
-#define MLX5_CNT_POOL_TYPE_EXT (1 << 0)
-#define MLX5_CNT_POOL_TYPE_AGE (1 << 1)
-
-#define IS_EXT_POOL(pool) (((pool)->type) & MLX5_CNT_POOL_TYPE_EXT)
-#define IS_AGE_POOL(pool) (((pool)->type) & MLX5_CNT_POOL_TYPE_AGE)
 
 #define MLX5_CNT_LEN(pool) \
 	(MLX5_CNT_SIZE + \
-	(IS_AGE_POOL(pool) ? MLX5_AGE_SIZE : 0) + \
-	(IS_EXT_POOL(pool) ? MLX5_CNTEXT_SIZE : 0))
+	((pool)->is_aged ? MLX5_AGE_SIZE : 0))
 #define MLX5_POOL_GET_CNT(pool, index) \
 	((struct mlx5_flow_counter *) \
 	((uint8_t *)((pool) + 1) + (index) * (MLX5_CNT_LEN(pool))))
@@ -303,12 +296,6 @@ struct mlx5_drop {
  */
 #define MLX5_MAKE_CNT_IDX(pi, offset) \
 	((pi) * MLX5_COUNTERS_PER_POOL + (offset) + 1)
-#define MLX5_CNT_TO_CNT_EXT(pool, cnt) \
-	((struct mlx5_flow_counter_ext *)\
-	((uint8_t *)((cnt) + 1) + \
-	(IS_AGE_POOL(pool) ? MLX5_AGE_SIZE : 0)))
-#define MLX5_GET_POOL_CNT_EXT(pool, offset) \
-	MLX5_CNT_TO_CNT_EXT(pool, MLX5_POOL_GET_CNT((pool), (offset)))
 #define MLX5_CNT_TO_AGE(cnt) \
 	((struct mlx5_age_param *)((cnt) + 1))
 /*
@@ -368,30 +355,41 @@ struct mlx5_flow_counter {
 		 * to the aging list. For shared counter, only when it is
 		 * released, the TAILQ entry memory will be used, at that
 		 * time, shared memory is not used anymore.
+		 *
+		 * Similarly for the non-batch counter dcs: since such a
+		 * counter does not support aging, the entry memory is
+		 * unused while the counter is allocated. As the bytes
+		 * memory is used only when the counter is allocated and
+		 * the entry memory only when the counter is free, the
+		 * dcs pointer can be saved in these two different
+		 * places at the two different stages. This eliminates
+		 * the individual counter extend struct.
 		 */
 		TAILQ_ENTRY(mlx5_flow_counter) next;
 		/**< Pointer to the next flow counter structure. */
-		struct mlx5_flow_counter_shared shared_info;
-		/**< Shared counter information. */
+		struct {
+			struct mlx5_flow_counter_shared shared_info;
+			/**< Shared counter information. */
+			void *dcs_when_active;
+			/*
+			 * For non-batch mode, the dcs is saved here
+			 * while the counter is allocated and in use.
+			 */
+		};
 	};
 	union {
 		uint64_t hits; /**< Reset value of hits packets. */
 		struct mlx5_flow_counter_pool *pool; /**< Counter pool. */
 	};
-	uint64_t bytes; /**< Reset value of bytes. */
-	void *action; /**< Pointer to the dv action. */
-};
-
-/* Extend counters information for none batch fallback counters. */
-struct mlx5_flow_counter_ext {
 	union {
-#if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-		struct ibv_counter_set *cs;
-#elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-		struct ibv_counters *cs;
-#endif
-		struct mlx5_devx_obj *dcs; /**< Counter Devx object. */
+		uint64_t bytes; /**< Reset value of bytes. */
+		void *dcs_when_free;
+		/*
+		 * For non-batch mode, the dcs will be saved here
+		 * when the counter is free.
+		 */
 	};
+	void *action; /**< Pointer to the dv action. */
 };
 
 TAILQ_HEAD(mlx5_counters, mlx5_flow_counter);
@@ -407,8 +405,8 @@ struct mlx5_flow_counter_pool {
 	/* The devx object of the minimum counter ID. */
 	uint64_t time_of_last_age_check;
 	/* System time (from rte_rdtsc()) read in the last aging check. */
-	uint32_t index:29; /* Pool index in container. */
-	uint32_t type:2; /* Memory type behind the counter array. */
+	uint32_t index:30; /* Pool index in container. */
+	uint32_t is_aged:1; /* Pool with aging counter. */
 	volatile uint32_t query_gen:1; /* Query round. */
 	rte_spinlock_t sl; /* The pool lock. */
 	rte_spinlock_t csl; /* The pool counter free list lock. */
@@ -454,6 +452,7 @@ struct mlx5_flow_counter_mng {
 	uint16_t pool_index;
 	uint8_t query_thread_on;
 	bool relaxed_ordering;
+	bool counter_fallback; /* Use counter fallback management. */
 	LIST_HEAD(mem_mngs, mlx5_counter_stats_mem_mng) mem_mngs;
 	LIST_HEAD(stat_raws, mlx5_counter_stats_raw) free_stat_raws;
 };
@@ -826,7 +825,6 @@ struct mlx5_priv {
 	unsigned int master:1; /* Device is a E-Switch master. */
 	unsigned int dr_shared:1; /* DV/DR data is shared. */
 	unsigned int txpp_en:1; /* Tx packet pacing enabled. */
-	unsigned int counter_fallback:1; /* Use counter fallback management. */
 	unsigned int mtr_en:1; /* Whether support meter. */
 	unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */
 	unsigned int sampler_en:1; /* Whether support sampler. */
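
With the extend struct gone, each pool entry is the generic counter
plus an optional trailing age parameter, so MLX5_CNT_LEN and
MLX5_POOL_GET_CNT reduce to simple pointer arithmetic. A sketch of the
same math with simplified, assumed types:

    #include <stdint.h>

    struct age_param { uint32_t sec_since_last_hit; };
    struct counter   { uint64_t hits; uint64_t bytes; };
    struct pool      { uint32_t is_aged; /* entries follow the pool */ };

    static inline uint32_t
    cnt_len(const struct pool *p)
    {
            return sizeof(struct counter) +
                   (p->is_aged ? sizeof(struct age_param) : 0);
    }

    static inline struct counter *
    pool_get_cnt(struct pool *p, uint32_t idx)
    {
            return (struct counter *)
                   ((uint8_t *)(p + 1) + idx * cnt_len(p));
    }

    static inline struct age_param *
    cnt_to_age(struct counter *c)
    {
            return (struct age_param *)(c + 1); /* aged pools only */
    }
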
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c79d02e..22fb4ee 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -6876,14 +6876,14 @@ struct mlx5_meter_domains_infos *
 	uint8_t query_gen = pool->query_gen ^ 1;
 	struct mlx5_flow_counter_mng *cmng = &sh->cmng;
 	enum mlx5_counter_type cnt_type =
-		IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
-				    MLX5_COUNTER_TYPE_ORIGIN;
+		pool->is_aged ? MLX5_COUNTER_TYPE_AGE :
+				MLX5_COUNTER_TYPE_ORIGIN;
 
 	if (unlikely(status)) {
 		raw_to_free = pool->raw_hw;
 	} else {
 		raw_to_free = pool->raw;
-		if (IS_AGE_POOL(pool))
+		if (pool->is_aged)
 			mlx5_flow_aging_check(sh, pool);
 		rte_spinlock_lock(&pool->sl);
 		pool->raw = pool->raw_hw;
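
The is_aged check above feeds the same double-buffered statistics
scheme as before: the async query fills pool->raw_hw while readers use
pool->raw under the pool lock, and the buffers swap on successful
completion. A simplified, hedged sketch of that swap:

    #include <rte_spinlock.h>

    struct stats_raw;                  /* opaque raw counter buffer */

    struct cnt_pool {
            rte_spinlock_t sl;
            struct stats_raw *raw;     /* stable copy, read under sl */
            struct stats_raw *raw_hw;  /* buffer the HW query fills */
    };

    /* Returns the buffer that can be recycled for the next query. */
    static struct stats_raw *
    on_query_done(struct cnt_pool *pool, int status)
    {
            struct stats_raw *to_free;

            if (status) {              /* query failed: drop HW buffer */
                    to_free = pool->raw_hw;
            } else {                   /* publish the fresh statistics */
                    to_free = pool->raw;
                    rte_spinlock_lock(&pool->sl);
                    pool->raw = pool->raw_hw;
                    rte_spinlock_unlock(&pool->sl);
            }
            pool->raw_hw = NULL;       /* ready for the next query */
            return to_free;
    }
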
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index d302a83..49d9636 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -4170,7 +4170,7 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	const struct rte_flow_action_age *age = action->conf;
 
-	if (!priv->config.devx || priv->counter_fallback)
+	if (!priv->config.devx || priv->sh->cmng.counter_fallback)
 		return rte_flow_error_set(error, ENOTSUP,
 					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
 					  NULL,
@@ -4739,16 +4739,13 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt;
-	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	int offset;
 
 	cnt = flow_dv_counter_get_by_idx(dev, counter, &pool);
 	MLX5_ASSERT(pool);
-	if (priv->counter_fallback) {
-		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-		return mlx5_devx_cmd_flow_counter_query(cnt_ext->dcs, 0,
+	if (priv->sh->cmng.counter_fallback)
+		return mlx5_devx_cmd_flow_counter_query(cnt->dcs_when_active, 0,
 					0, pkts, bytes, 0, NULL, NULL, 0);
-	}
 	rte_spinlock_lock(&pool->sl);
 	if (!pool->raw) {
 		*pkts = 0;
@@ -4784,11 +4781,10 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
-	uint32_t fallback = priv->counter_fallback;
+	bool fallback = priv->sh->cmng.counter_fallback;
 	uint32_t size = sizeof(*pool);
 
 	size += MLX5_COUNTERS_PER_POOL * MLX5_CNT_SIZE;
-	size += (!fallback ? 0 : MLX5_COUNTERS_PER_POOL * MLX5_CNTEXT_SIZE);
 	size += (!age ? 0 : MLX5_COUNTERS_PER_POOL * MLX5_AGE_SIZE);
 	pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 	if (!pool) {
@@ -4796,8 +4792,7 @@ struct field_modify_info modify_tcp[] = {
 		return NULL;
 	}
 	pool->raw = NULL;
-	pool->type = 0;
-	pool->type |= (!age ? 0 :  MLX5_CNT_POOL_TYPE_AGE);
+	pool->is_aged = !!age;
 	pool->query_gen = 0;
 	pool->min_dcs = dcs;
 	rte_spinlock_init(&pool->sl);
@@ -4822,7 +4817,6 @@ struct field_modify_info modify_tcp[] = {
 		if (base > cmng->max_id)
 			cmng->max_id = base + MLX5_COUNTERS_PER_POOL - 1;
 		cmng->last_pool_idx = pool->index;
-		pool->type |= MLX5_CNT_POOL_TYPE_EXT;
 	}
 	rte_spinlock_unlock(&cmng->pool_update_sl);
 	return pool;
@@ -4855,7 +4849,7 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_flow_counter *cnt;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
-	uint32_t fallback = priv->counter_fallback;
+	bool fallback = priv->sh->cmng.counter_fallback;
 	uint32_t i;
 
 	if (fallback) {
@@ -4874,7 +4868,7 @@ struct field_modify_info modify_tcp[] = {
 		i = dcs->id % MLX5_COUNTERS_PER_POOL;
 		cnt = MLX5_POOL_GET_CNT(pool, i);
 		cnt->pool = pool;
-		MLX5_GET_POOL_CNT_EXT(pool, i)->dcs = dcs;
+		cnt->dcs_when_free = dcs;
 		*cnt_free = cnt;
 		return pool;
 	}
@@ -4919,8 +4913,7 @@ struct field_modify_info modify_tcp[] = {
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool = NULL;
 	struct mlx5_flow_counter *cnt_free = NULL;
-	struct mlx5_flow_counter_ext *cnt_ext = NULL;
-	uint32_t fallback = priv->counter_fallback;
+	bool fallback = priv->sh->cmng.counter_fallback;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	enum mlx5_counter_type cnt_type =
 			age ? MLX5_COUNTER_TYPE_AGE : MLX5_COUNTER_TYPE_ORIGIN;
@@ -4940,7 +4933,7 @@ struct field_modify_info modify_tcp[] = {
 		goto err;
 	pool = cnt_free->pool;
 	if (fallback)
-		cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt_free);
+		cnt_free->dcs_when_active = cnt_free->dcs_when_free;
 	/* Create a DV counter action only in the first time usage. */
 	if (!cnt_free->action) {
 		uint16_t offset;
@@ -4952,7 +4945,7 @@ struct field_modify_info modify_tcp[] = {
 			dcs = pool->min_dcs;
 		} else {
 			offset = 0;
-			dcs = cnt_ext->dcs;
+			dcs = cnt_free->dcs_when_free;
 		}
 		ret = mlx5_flow_os_create_flow_action_count(dcs->obj, offset,
 							    &cnt_free->action);
@@ -4974,6 +4967,8 @@ struct field_modify_info modify_tcp[] = {
 err:
 	if (cnt_free) {
 		cnt_free->pool = pool;
+		if (fallback)
+			cnt_free->dcs_when_free = cnt_free->dcs_when_active;
 		rte_spinlock_lock(&cmng->csl[cnt_type]);
 		TAILQ_INSERT_TAIL(&cmng->counters[cnt_type], cnt_free, next);
 		rte_spinlock_unlock(&cmng->csl[cnt_type]);
@@ -5117,7 +5112,7 @@ struct field_modify_info modify_tcp[] = {
 	if (IS_SHARED_CNT(counter) &&
 	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
 		return;
-	if (IS_AGE_POOL(pool))
+	if (pool->is_aged)
 		flow_dv_counter_remove_from_age(dev, counter, cnt);
 	cnt->pool = pool;
 	/*
@@ -5130,13 +5125,14 @@ struct field_modify_info modify_tcp[] = {
 	 * function both operate with the different list.
 	 *
 	 */
-	if (!priv->counter_fallback) {
+	if (!priv->sh->cmng.counter_fallback) {
 		rte_spinlock_lock(&pool->csl);
 		TAILQ_INSERT_TAIL(&pool->counters[pool->query_gen], cnt, next);
 		rte_spinlock_unlock(&pool->csl);
 	} else {
-		cnt_type = IS_AGE_POOL(pool) ? MLX5_COUNTER_TYPE_AGE :
-					       MLX5_COUNTER_TYPE_ORIGIN;
+		cnt->dcs_when_free = cnt->dcs_when_active;
+		cnt_type = pool->is_aged ? MLX5_COUNTER_TYPE_AGE :
+					   MLX5_COUNTER_TYPE_ORIGIN;
 		rte_spinlock_lock(&priv->sh->cmng.csl[cnt_type]);
 		TAILQ_INSERT_TAIL(&priv->sh->cmng.counters[cnt_type],
 				  cnt, next);
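
The fallback branches above are the allocation and release halves of
the dcs handoff between the two union slots, guarded by the per-type
free-list lock. A sketch of the pattern under those assumptions
(simplified names, not the driver's structures):

    #include <stdbool.h>
    #include <stdint.h>
    #include <sys/queue.h>
    #include <rte_spinlock.h>

    struct devx_obj;

    struct counter {                  /* as in the earlier union sketch */
            union {
                    TAILQ_ENTRY(counter) next;
                    struct devx_obj *dcs_when_active;
            };
            union {
                    uint64_t bytes;
                    struct devx_obj *dcs_when_free;
            };
    };

    enum cnt_type { CNT_ORIGIN, CNT_AGE, CNT_TYPE_MAX };

    TAILQ_HEAD(cnt_list, counter);

    struct cnt_mng {
            rte_spinlock_t csl[CNT_TYPE_MAX]; /* one lock per list */
            struct cnt_list counters[CNT_TYPE_MAX];
    };

    static struct counter *
    cnt_alloc(struct cnt_mng *cmng, enum cnt_type type, bool fallback)
    {
            struct counter *cnt;

            rte_spinlock_lock(&cmng->csl[type]);
            cnt = TAILQ_FIRST(&cmng->counters[type]);
            if (cnt)
                    TAILQ_REMOVE(&cmng->counters[type], cnt, next);
            rte_spinlock_unlock(&cmng->csl[type]);
            if (cnt && fallback)      /* move dcs to the in-use slot */
                    cnt->dcs_when_active = cnt->dcs_when_free;
            return cnt;
    }

    static void
    cnt_release(struct cnt_mng *cmng, enum cnt_type type,
                struct counter *cnt, bool fallback)
    {
            if (fallback)             /* move dcs back to the free slot */
                    cnt->dcs_when_free = cnt->dcs_when_active;
            rte_spinlock_lock(&cmng->csl[type]);
            TAILQ_INSERT_TAIL(&cmng->counters[type], cnt, next);
            rte_spinlock_unlock(&cmng->csl[type]);
    }
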
diff --git a/drivers/net/mlx5/mlx5_flow_verbs.c b/drivers/net/mlx5/mlx5_flow_verbs.c
index 0bb17b5..710622c 100644
--- a/drivers/net/mlx5/mlx5_flow_verbs.c
+++ b/drivers/net/mlx5/mlx5_flow_verbs.c
@@ -186,16 +186,16 @@
  */
 static int
 flow_verbs_counter_create(struct rte_eth_dev *dev,
-			  struct mlx5_flow_counter_ext *counter)
+			  struct mlx5_flow_counter *counter)
 {
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct ibv_context *ctx = priv->sh->ctx;
 	struct ibv_counter_set_init_attr init = {
-			 .counter_set_id = counter->id};
+			 .counter_set_id = counter->shared_info.id};
 
-	counter->cs = mlx5_glue->create_counter_set(ctx, &init);
-	if (!counter->cs) {
+	counter->dcs_when_free = mlx5_glue->create_counter_set(ctx, &init);
+	if (!counter->dcs_when_free) {
 		rte_errno = ENOTSUP;
 		return -ENOTSUP;
 	}
@@ -208,23 +208,23 @@
 	int ret;
 
 	memset(&attach, 0, sizeof(attach));
-	counter->cs = mlx5_glue->create_counters(ctx, &init);
-	if (!counter->cs) {
+	counter->dcs_when_free = mlx5_glue->create_counters(ctx, &init);
+	if (!counter->dcs_when_free) {
 		rte_errno = ENOTSUP;
 		return -ENOTSUP;
 	}
 	attach.counter_desc = IBV_COUNTER_PACKETS;
 	attach.index = 0;
-	ret = mlx5_glue->attach_counters(counter->cs, &attach, NULL);
+	ret = mlx5_glue->attach_counters(counter->dcs_when_free, &attach, NULL);
 	if (!ret) {
 		attach.counter_desc = IBV_COUNTER_BYTES;
 		attach.index = 1;
 		ret = mlx5_glue->attach_counters
-					(counter->cs, &attach, NULL);
+					(counter->dcs_when_free, &attach, NULL);
 	}
 	if (ret) {
-		claim_zero(mlx5_glue->destroy_counters(counter->cs));
-		counter->cs = NULL;
+		claim_zero(mlx5_glue->destroy_counters(counter->dcs_when_free));
+		counter->dcs_when_free = NULL;
 		rte_errno = ret;
 		return -ret;
 	}
@@ -256,7 +256,6 @@
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_mng *cmng = &priv->sh->cmng;
 	struct mlx5_flow_counter_pool *pool = NULL;
-	struct mlx5_flow_counter_ext *cnt_ext = NULL;
 	struct mlx5_flow_counter *cnt = NULL;
 	union mlx5_l3t_data data;
 	uint32_t n_valid = cmng->n_valid;
@@ -297,12 +296,10 @@
 			cmng->n += MLX5_CNT_CONTAINER_RESIZE;
 		}
 		/* Allocate memory for new pool*/
-		size = sizeof(*pool) + (sizeof(*cnt_ext) + sizeof(*cnt)) *
-		       MLX5_COUNTERS_PER_POOL;
+		size = sizeof(*pool) + sizeof(*cnt) * MLX5_COUNTERS_PER_POOL;
 		pool = mlx5_malloc(MLX5_MEM_ZERO, size, 0, SOCKET_ID_ANY);
 		if (!pool)
 			return 0;
-		pool->type |= MLX5_CNT_POOL_TYPE_EXT;
 		for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) {
 			cnt = MLX5_POOL_GET_CNT(pool, i);
 			TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
@@ -312,6 +309,7 @@
 		pool_idx = n_valid;
 		cmng->n_valid++;
 	}
+	TAILQ_REMOVE(&pool->counters[0], cnt, next);
 	i = MLX5_CNT_ARRAY_IDX(pool, cnt);
 	cnt_idx = MLX5_MAKE_CNT_IDX(pool_idx, i);
 	if (shared) {
@@ -321,15 +319,15 @@
 		cnt->shared_info.id = id;
 		cnt_idx |= MLX5_CNT_SHARED_OFFSET;
 	}
-	cnt_ext = MLX5_GET_POOL_CNT_EXT(pool, i);
-	cnt->hits = 0;
-	cnt->bytes = 0;
 	/* Create counter with Verbs. */
-	ret = flow_verbs_counter_create(dev, cnt_ext);
+	ret = flow_verbs_counter_create(dev, cnt);
 	if (!ret) {
-		TAILQ_REMOVE(&pool->counters[0], cnt, next);
+		cnt->dcs_when_active = cnt->dcs_when_free;
+		cnt->hits = 0;
+		cnt->bytes = 0;
 		return cnt_idx;
 	}
+	TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 	/* Some error occurred in Verbs library. */
 	rte_errno = -ret;
 	return 0;
@@ -349,21 +347,18 @@
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter *cnt;
-	struct mlx5_flow_counter_ext *cnt_ext;
 
 	cnt = flow_verbs_counter_get_by_idx(dev, counter, &pool);
 	if (IS_SHARED_CNT(counter) &&
 	    mlx5_l3t_clear_entry(priv->sh->cnt_id_tbl, cnt->shared_info.id))
 		return;
-	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
-	claim_zero(mlx5_glue->destroy_counter_set(cnt_ext->cs));
-	cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counter_set
+			((struct ibv_counter_set *)cnt->dcs_when_active));
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
-	claim_zero(mlx5_glue->destroy_counters(cnt_ext->cs));
-	cnt_ext->cs = NULL;
+	claim_zero(mlx5_glue->destroy_counters
+				((struct ibv_counters *)cnt->dcs_when_active));
 #endif
-	(void)cnt_ext;
 	TAILQ_INSERT_HEAD(&pool->counters[0], cnt, next);
 }
 
@@ -384,13 +379,12 @@
 		struct mlx5_flow_counter_pool *pool;
 		struct mlx5_flow_counter *cnt = flow_verbs_counter_get_by_idx
 						(dev, flow->counter, &pool);
-		struct mlx5_flow_counter_ext *cnt_ext = MLX5_CNT_TO_CNT_EXT
-						(pool, cnt);
 		struct rte_flow_query_count *qc = data;
 		uint64_t counters[2] = {0, 0};
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 		struct ibv_query_counter_set_attr query_cs_attr = {
-			.cs = cnt_ext->cs,
+			.cs = (struct ibv_counter_set *)
+						cnt->dcs_when_active,
 			.query_flags = IBV_COUNTER_SET_FORCE_UPDATE,
 		};
 		struct ibv_counter_set_data query_out = {
@@ -401,7 +395,7 @@
 						       &query_out);
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
 		int err = mlx5_glue->query_counters
-			       (cnt_ext->cs, counters,
+			((struct ibv_counters *)cnt->dcs_when_active, counters,
 				RTE_DIM(counters),
 				IBV_READ_COUNTERS_ATTR_PREFER_CACHED);
 #endif
@@ -1188,7 +1182,6 @@
 	defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
 	struct mlx5_flow_counter_pool *pool;
 	struct mlx5_flow_counter *cnt = NULL;
-	struct mlx5_flow_counter_ext *cnt_ext;
 	unsigned int size = sizeof(struct ibv_flow_spec_counter_action);
 	struct ibv_flow_spec_counter_action counter = {
 		.type = IBV_FLOW_SPEC_ACTION_COUNT,
@@ -1208,13 +1201,12 @@
 	}
 #if defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42)
 	cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool);
-	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-	counter.counter_set_handle = cnt_ext->cs->handle;
+	counter.counter_set_handle =
+		((struct ibv_counter_set *)cnt->dcs_when_active)->handle;
 	flow_verbs_spec_add(&dev_flow->verbs, &counter, size);
 #elif defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
 	cnt = flow_verbs_counter_get_by_idx(dev, flow->counter, &pool);
-	cnt_ext = MLX5_CNT_TO_CNT_EXT(pool, cnt);
-	counter.counters = cnt_ext->cs;
+	counter.counters = (struct ibv_counters *)cnt->dcs_when_active;
 	flow_verbs_spec_add(&dev_flow->verbs, &counter, size);
 #endif
 	return 0;
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe
  2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
                     ` (7 preceding siblings ...)
  2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: optimize counter extend memory Suanming Mou
@ 2020-10-20 22:59   ` Raslan Darawsheh
  8 siblings, 0 replies; 17+ messages in thread
From: Raslan Darawsheh @ 2020-10-20 22:59 UTC (permalink / raw)
  To: Suanming Mou; +Cc: dev

Hi,
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Suanming Mou
> Sent: Tuesday, October 20, 2020 6:02 AM
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe
> 
> The mlx5 PMD is going to support multiple-thread flow operations.
> This patchset makes the counter action to be thread safe.
> 
> Suanming Mou (8):
>   net/mlx5: locate aging pools in the general container
>   net/mlx5: optimize shared counter memory
>   net/mlx5: remove single counter container
>   net/mlx5: synchronize flow counter pool creation
>   net/mlx5: make three level table thread safe
>   net/mlx5: make shared counters thread safe
>   net/mlx5: rename flow counter macro
>   net/mlx5: optimize counter extend memory
> 
> --
> 
> v2:
>  - Optimize the fallback non-batch counter memory.
>  - Add MLX5_ prefix to counter macro.
>  - Rebase on top of the latest code.
> 
> --
> 
>  drivers/net/mlx5/linux/mlx5_os.c   |  53 ++-
>  drivers/net/mlx5/mlx5.c            |  51 ++-
>  drivers/net/mlx5/mlx5.h            | 146 ++++----
>  drivers/net/mlx5/mlx5_flow.c       | 183 ++++++----
>  drivers/net/mlx5/mlx5_flow.h       |   1 +
>  drivers/net/mlx5/mlx5_flow_dv.c    | 706 ++++++++++++++-----------------------
>  drivers/net/mlx5/mlx5_flow_verbs.c | 130 ++++---
>  drivers/net/mlx5/mlx5_utils.c      | 191 ++++++++--
>  drivers/net/mlx5/mlx5_utils.h      |  81 ++++-
>  9 files changed, 824 insertions(+), 718 deletions(-)
> 
> --
> 1.8.3.1

Series applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-10-20 22:59 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-06 11:38 [dpdk-dev] [PATCH 0/6] net/mlx5: make counter thread safe Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 1/6] net/mlx5: locate aging pools in the general container Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 2/6] net/mlx5: optimize shared counter memory Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 3/6] net/mlx5: remove single counter container Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 4/6] net/mlx5: synchronize flow counter pool creation Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 5/6] net/mlx5: make three level table thread safe Suanming Mou
2020-10-06 11:38 ` [dpdk-dev] [PATCH 6/6] net/mlx5: make shared counters " Suanming Mou
2020-10-20  3:02 ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter " Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 1/8] net/mlx5: locate aging pools in the general container Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 2/8] net/mlx5: optimize shared counter memory Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 3/8] net/mlx5: remove single counter container Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 4/8] net/mlx5: synchronize flow counter pool creation Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 5/8] net/mlx5: make three level table thread safe Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 6/8] net/mlx5: make shared counters " Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 7/8] net/mlx5: rename flow counter macro Suanming Mou
2020-10-20  3:02   ` [dpdk-dev] [PATCH v2 8/8] net/mlx5: optimize counter extend memory Suanming Mou
2020-10-20 22:59   ` [dpdk-dev] [PATCH v2 0/8] net/mlx5: make counter thread safe Raslan Darawsheh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).