DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH v1 0/2] DLB Enqueue Reorder Support
@ 2024-06-21 20:12 Abdullah Sevincer
  2024-06-21 20:12 ` [PATCH v1 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
  2024-06-21 20:12 ` [PATCH v1 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  0 siblings, 2 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:12 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This patchset implements DLB enqueue reorder support.

v1: Initial patchset

Abdullah Sevincer (2):
  event/dlb2: add support for enqueue reordering
  eventdev: add support for enqueue reorder

 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 505 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_frag.h         |  13 +
 drivers/event/dlb2/dlb2_inline_fns.h   |  28 ++
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/meson_options.txt   |   6 +
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 lib/eventdev/rte_eventdev.h            |   8 +
 9 files changed, 433 insertions(+), 190 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_frag.h
 create mode 100644 drivers/event/dlb2/meson_options.txt

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v1 1/2] event/dlb2: add support for enqueue reordering
  2024-06-21 20:12 [PATCH v1 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
@ 2024-06-21 20:12 ` Abdullah Sevincer
  2024-06-21 20:51   ` [PATCH v2 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  2024-06-21 22:24   ` [PATCH v3 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  2024-06-21 20:12 ` [PATCH v1 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  1 sibling, 2 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:12 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions reorder support is needed for DLB PMD so that it
restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave same as older releases. To enable reordering feature,
applications need to add RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER flag
to port configuration.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 505 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_frag.h         |  13 +
 drivers/event/dlb2/dlb2_inline_fns.h   |  28 ++
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/meson_options.txt   |   6 +
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 8 files changed, 425 insertions(+), 190 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_frag.h
 create mode 100644 drivers/event/dlb2/meson_options.txt

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 7c88de381b..1a9a6c2db5 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -144,6 +144,10 @@ New Features
 
   Added an API that allows the user to reclaim the defer queue with RCU.
 
+* **Updated DLB2 Driver for enqueue reordering feature**
+  * Added support for DLB reordering feature. Applications should use
+    ``RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER`` to enable the feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..0eadc9a489 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,10 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
+/* These functions will vary based on processor capabilities */
+static struct dlb2_port_low_level_io_functions qm_mmio_fns;
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -98,6 +102,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +313,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1454,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1561,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1633,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1988,7 +1998,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2806,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
-	dlb2_movdir64b(port_data->pp_addr, qe4);
+	qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2853,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2874,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3025,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3201,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3581,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3617,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3625,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3804,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3823,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3878,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3909,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3924,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4271,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4279,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4322,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4330,36 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		if (ev_port->outstanding_releases != 0)
+			rte_panic("Still outstanding releases %d\n", ev_port->outstanding_releases);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4384,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4411,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4560,6 +4706,17 @@ dlb2_entry_points_init(struct rte_eventdev *dev)
 	}
 }
 
+static void
+dlb2_qm_mmio_fn_init(void)
+{
+	/* Process-local function pointers for performing low level port i/o */
+
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIR64B))
+		qm_mmio_fns.pp_enqueue_four = dlb2_movdir64b;
+	else
+		qm_mmio_fns.pp_enqueue_four = dlb2_movntdq;
+}
+
 int
 dlb2_primary_eventdev_probe(struct rte_eventdev *dev,
 			    const char *name,
@@ -4674,6 +4831,8 @@ dlb2_primary_eventdev_probe(struct rte_eventdev *dev,
 
 	rte_spinlock_init(&dlb2->qm_instance.resource_lock);
 
+	dlb2_qm_mmio_fn_init();
+
 	dlb2_iface_low_level_io_init();
 
 	dlb2_entry_points_init(dev);
@@ -4706,6 +4865,8 @@ dlb2_secondary_eventdev_probe(struct rte_eventdev *dev,
 		return err;
 	}
 
+	dlb2_qm_mmio_fn_init();
+
 	dlb2_iface_low_level_io_init();
 
 	dlb2_entry_points_init(dev);
@@ -4939,6 +5100,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_frag.h b/drivers/event/dlb2/dlb2_frag.h
new file mode 100644
index 0000000000..18163d31b1
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_frag.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2017 Intel Corporation
+ */
+
+#ifndef _DLB2_FRAG_H_
+#define _DLB2_FRAG_H_
+
+/* Fragments/partials are not supported by the API, but the capability is in
+ * the PMD in case future support is added.
+ */
+#define RTE_EVENT_DLB2_OP_FRAG 3
+
+#endif	/* _DLB2_FRAG_H_ */
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..0568bec549 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,32 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
+static inline void
+dlb2_movntdq(void *pp_addr, void *qe4)
+{
+	/* Move entire 64B cache line of QEs, 128 bits (16B) at a time. */
+	long long *_qe  = (long long *)qe4;
+
+	__v2di src_data0 = (__v2di){_qe[0], _qe[1]};
+	__v2di src_data1 = (__v2di){_qe[2], _qe[3]};
+	__v2di src_data2 = (__v2di){_qe[4], _qe[5]};
+	__v2di src_data3 = (__v2di){_qe[6], _qe[7]};
+
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 0, (__v2di)src_data0);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 1, (__v2di)src_data1);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 2, (__v2di)src_data2);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 3, (__v2di)src_data3);
+	rte_wmb();
+}
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..07a6b12f9c 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -387,8 +387,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +657,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +678,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/meson_options.txt b/drivers/event/dlb2/meson_options.txt
new file mode 100644
index 0000000000..69be6f41c1
--- /dev/null
+++ b/drivers/event/dlb2/meson_options.txt
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2023-2024 Intel Corporation
+
+DLB2_BYPASS_FENCE_ON_PP = 0
+DLB_HW_CREDITS_CHECKS = 0
+DLB_SW_CREDITS_CHECKS = 1
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v1 2/2] eventdev: add support for enqueue reorder
  2024-06-21 20:12 [PATCH v1 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  2024-06-21 20:12 ` [PATCH v1 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
@ 2024-06-21 20:12 ` Abdullah Sevincer
  1 sibling, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:12 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support flag to enable enqueue reorder
feature.

When this flag is enabled in the port configuration PMD
restores dequeue order on enqueue if applications happen to
change it.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 lib/eventdev/rte_eventdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..f4220dd5dc 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *  @see rte_event_port_setup()
  */
 
+#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
+/**< Flag to enable feature enqueue reordering to dequeue.
+ * The feature restores dequeue order on enqueue if applications
+ * happen to change the order.
+ *
+ *  @see rte_event_port_setup()
+ */
+
 /** Event port configuration structure */
 struct rte_event_port_conf {
 	int32_t new_event_threshold;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v2 0/2] DLB Enqueue Reorder Support
  2024-06-21 20:12 ` [PATCH v1 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
@ 2024-06-21 20:51   ` Abdullah Sevincer
  2024-06-21 20:51     ` [PATCH v2 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
  2024-06-21 20:51     ` [PATCH v2 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  2024-06-21 22:24   ` [PATCH v3 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  1 sibling, 2 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This patchset implements DLB enqueue reorder support.

v2: Fix CI issue.
v1: Initial patchset

Abdullah Sevincer (2):
  event/dlb2: add support for enqueue reordering
  eventdev: add support for enqueue reorder

 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 504 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |  28 ++
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 lib/eventdev/rte_eventdev.h            |   8 +
 7 files changed, 413 insertions(+), 190 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v2 1/2] event/dlb2: add support for enqueue reordering
  2024-06-21 20:51   ` [PATCH v2 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
@ 2024-06-21 20:51     ` Abdullah Sevincer
  2024-06-21 20:51     ` [PATCH v2 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  1 sibling, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions reorder support is needed for DLB PMD so that it
restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave same as older releases. To enable reordering feature,
applications need to add RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER flag
to port configuration.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 504 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |  28 ++
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 6 files changed, 405 insertions(+), 190 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 7c88de381b..1a9a6c2db5 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -144,6 +144,10 @@ New Features
 
   Added an API that allows the user to reclaim the defer queue with RCU.
 
+* **Updated DLB2 Driver for enqueue reordering feature**
+  * Added support for DLB reordering feature. Applications should use
+    ``RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER`` to enable the feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..dd2b45bab0 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,10 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
+/* These functions will vary based on processor capabilities */
+static struct dlb2_port_low_level_io_functions qm_mmio_fns;
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -98,6 +102,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +313,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1454,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1561,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1633,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1988,7 +1998,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2806,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
-	dlb2_movdir64b(port_data->pp_addr, qe4);
+	qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	qm_mmio_fns.pp_enqueue_four(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2853,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2874,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3025,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3201,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3581,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3617,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3625,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3804,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3823,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3878,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3909,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3924,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4271,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4279,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4322,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4330,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4383,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4410,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4560,6 +4705,17 @@ dlb2_entry_points_init(struct rte_eventdev *dev)
 	}
 }
 
+static void
+dlb2_qm_mmio_fn_init(void)
+{
+	/* Process-local function pointers for performing low level port i/o */
+
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIR64B))
+		qm_mmio_fns.pp_enqueue_four = dlb2_movdir64b;
+	else
+		qm_mmio_fns.pp_enqueue_four = dlb2_movntdq;
+}
+
 int
 dlb2_primary_eventdev_probe(struct rte_eventdev *dev,
 			    const char *name,
@@ -4674,6 +4830,8 @@ dlb2_primary_eventdev_probe(struct rte_eventdev *dev,
 
 	rte_spinlock_init(&dlb2->qm_instance.resource_lock);
 
+	dlb2_qm_mmio_fn_init();
+
 	dlb2_iface_low_level_io_init();
 
 	dlb2_entry_points_init(dev);
@@ -4706,6 +4864,8 @@ dlb2_secondary_eventdev_probe(struct rte_eventdev *dev,
 		return err;
 	}
 
+	dlb2_qm_mmio_fn_init();
+
 	dlb2_iface_low_level_io_init();
 
 	dlb2_entry_points_init(dev);
@@ -4939,6 +5099,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..0568bec549 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,32 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
+static inline void
+dlb2_movntdq(void *pp_addr, void *qe4)
+{
+	/* Move entire 64B cache line of QEs, 128 bits (16B) at a time. */
+	long long *_qe  = (long long *)qe4;
+
+	__v2di src_data0 = (__v2di){_qe[0], _qe[1]};
+	__v2di src_data1 = (__v2di){_qe[2], _qe[3]};
+	__v2di src_data2 = (__v2di){_qe[4], _qe[5]};
+	__v2di src_data3 = (__v2di){_qe[6], _qe[7]};
+
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 0, (__v2di)src_data0);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 1, (__v2di)src_data1);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 2, (__v2di)src_data2);
+	rte_wmb();
+	__builtin_ia32_movntdq((__v2di *)pp_addr + 3, (__v2di)src_data3);
+	rte_wmb();
+}
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..07a6b12f9c 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -387,8 +387,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +657,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +678,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v2 2/2] eventdev: add support for enqueue reorder
  2024-06-21 20:51   ` [PATCH v2 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  2024-06-21 20:51     ` [PATCH v2 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
@ 2024-06-21 20:51     ` Abdullah Sevincer
  1 sibling, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 20:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support flag to enable enqueue reorder
feature.

When this flag is enabled in the port configuration PMD
restores dequeue order on enqueue if applications happen to
change it.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 lib/eventdev/rte_eventdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..f4220dd5dc 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *  @see rte_event_port_setup()
  */
 
+#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
+/**< Flag to enable feature enqueue reordering to dequeue.
+ * The feature restores dequeue order on enqueue if applications
+ * happen to change the order.
+ *
+ *  @see rte_event_port_setup()
+ */
+
 /** Event port configuration structure */
 struct rte_event_port_conf {
 	int32_t new_event_threshold;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v3 0/2] DLB Enqueue Reorder Support
  2024-06-21 20:12 ` [PATCH v1 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
  2024-06-21 20:51   ` [PATCH v2 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
@ 2024-06-21 22:24   ` Abdullah Sevincer
  2024-06-21 22:24     ` [PATCH v3 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  1 sibling, 2 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 22:24 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This patchset implements DLB enqueue reorder support.

v3: Fix build issues.
v2: Fix CI issues.
v1: Initial patchset.

Abdullah Sevincer (2):
  event/dlb2: add support for enqueue reordering
  eventdev: add support for enqueue reorder

 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 484 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 lib/eventdev/rte_eventdev.h            |   8 +
 7 files changed, 374 insertions(+), 189 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v3 1/2] event/dlb2: add support for enqueue reordering
  2024-06-21 22:24   ` [PATCH v3 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
@ 2024-06-21 22:24     ` Abdullah Sevincer
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  1 sibling, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 22:24 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions reorder support is needed for DLB PMD so that it
restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave same as older releases. To enable reordering feature,
applications need to add RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER flag
to port configuration.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst |   4 +
 drivers/event/dlb2/dlb2.c              | 484 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  22 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 6 files changed, 366 insertions(+), 189 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 7c88de381b..1a9a6c2db5 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -144,6 +144,10 @@ New Features
 
   Added an API that allows the user to reclaim the defer queue with RCU.
 
+* **Updated DLB2 Driver for enqueue reordering feature**
+  * Added support for DLB reordering feature. Applications should use
+    ``RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER`` to enable the feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..13eb75d7b1 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -98,6 +99,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +310,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1451,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1558,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1630,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1988,7 +1995,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2803,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2850,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2871,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3022,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3198,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3578,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3614,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3622,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3801,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3820,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3875,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3906,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3921,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4268,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4276,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4319,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4327,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4380,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4407,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5081,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..07a6b12f9c 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -387,8 +387,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +657,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +678,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-21 22:24   ` [PATCH v3 0/2] DLB Enqueue Reorder Support Abdullah Sevincer
  2024-06-21 22:24     ` [PATCH v3 1/2] event/dlb2: add support for enqueue reordering Abdullah Sevincer
@ 2024-06-21 22:24     ` Abdullah Sevincer
  2024-06-24  8:28       ` Jerin Jacob
                         ` (5 more replies)
  1 sibling, 6 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-06-21 22:24 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support flag to enable enqueue reorder
feature.

When this flag is enabled in the port configuration PMD
restores dequeue order on enqueue if applications happen to
change it.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 lib/eventdev/rte_eventdev.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..f4220dd5dc 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *  @see rte_event_port_setup()
  */
 
+#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
+/**< Flag to enable feature enqueue reordering to dequeue.
+ * The feature restores dequeue order on enqueue if applications
+ * happen to change the order.
+ *
+ *  @see rte_event_port_setup()
+ */
+
 /** Event port configuration structure */
 struct rte_event_port_conf {
 	int32_t new_event_threshold;
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
@ 2024-06-24  8:28       ` Jerin Jacob
  2024-06-26 18:31         ` Sevincer, Abdullah
  2024-07-01  8:24       ` Mattias Rönnblom
                         ` (4 subsequent siblings)
  5 siblings, 1 reply; 99+ messages in thread
From: Jerin Jacob @ 2024-06-24  8:28 UTC (permalink / raw)
  To: Abdullah Sevincer
  Cc: dev, jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On Sat, Jun 22, 2024 at 4:02 AM Abdullah Sevincer
<abdullah.sevincer@intel.com> wrote:
>
> This commit adds support flag to enable enqueue reorder
> feature.
>
> When this flag is enabled in the port configuration PMD
> restores dequeue order on enqueue if applications happen to
> change it.
>
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>  lib/eventdev/rte_eventdev.h | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..f4220dd5dc 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>   *  @see rte_event_port_setup()
>   */
>
> +#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
> +/**< Flag to enable feature enqueue reordering to dequeue.
> + * The feature restores dequeue order on enqueue if applications
> + * happen to change the order.

# Is this feature or limitation?
# What is the use case for this feature?
# If application don't care about ORDER, they can use
RTE_SCHED_TYPE_PARALLEL. Right?
# Can you share the feature in the context of the below text in specification?

----------------
/* Scheduler type definitions */
#define RTE_SCHED_TYPE_ORDERED          0
/**< Ordered scheduling
 *
 * Events from an ordered flow of an event queue can be scheduled to multiple
 * ports for concurrent processing while maintaining the original event order,
 * i.e. the order in which they were first enqueued to that queue.
 * This scheme allows events pertaining to the same, potentially large, flow to
 * be processed in parallel on multiple cores without incurring any
 * application-level order restoration logic overhead.
 *
 * After events are dequeued from a set of ports, as those events are
re-enqueued
 * to another queue (with the op field set to @ref
RTE_EVENT_OP_FORWARD), the event
 * device restores the original event order - including events returned from all
 * ports in the set - before the events are placed on the destination queue,
 * for subsequent scheduling to ports

-----------------


> + *
> + *  @see rte_event_port_setup()
> + */
> +
>  /** Event port configuration structure */
>  struct rte_event_port_conf {
>         int32_t new_event_threshold;
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-24  8:28       ` Jerin Jacob
@ 2024-06-26 18:31         ` Sevincer, Abdullah
  2024-06-27 13:13           ` Jerin Jacob
  0 siblings, 1 reply; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-06-26 18:31 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, jerinj, Richardson, Bruce, Pathak, Pravin, mattias.ronnblom,
	Aggarwal, Manish

Hi Jerin my responses below:
>+# Is this feature or limitation?
This is a new feature to enable enqueuing to PMD in any order when the underlined hardware device needs enqueues in a strict dequeue order.
>+# What is the use case for this feature?
This is needed by the applications which processes events in batches based on their flow ids. Received burst is sorted based on flow ids.
>+# If application don't care about ORDER, they can use RTE_SCHED_TYPE_PARALLEL. Right?
This is the ordering between enqueue and dequeue and not across cores.
>+# Can you share the feature in the context of the below text in specification?
Since the feature is not across cores below context does not apply.

>+----------------
>+/* Scheduler type definitions */
>+#define RTE_SCHED_TYPE_ORDERED          0
>+/**< Ordered scheduling
>+ *
>+ * Events from an ordered flow of an event queue can be scheduled to multiple
>+ * ports for concurrent processing while maintaining the original event order,
>+ * i.e. the order in which they were first enqueued to that queue.
>+ * This scheme allows events pertaining to the same, potentially large, flow to
>+ * be processed in parallel on multiple cores without incurring any
>+* application-level order restoration logic overhead.
>+ *
>+* After events are dequeued from a set of ports, as those events are re-enqueued
>+* to another queue (with the op field set to @ref RTE_EVENT_OP_FORWARD), the event
>+* device restores the original event order - including events returned from all
>+* ports in the set - before the events are placed on the destination queue,
>+* for subsequent scheduling to ports


> + *
> + *  @see rte_event_port_setup()
> + */
> +
>  /** Event port configuration structure */  struct rte_event_port_conf 
> {
>         int32_t new_event_threshold;
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-26 18:31         ` Sevincer, Abdullah
@ 2024-06-27 13:13           ` Jerin Jacob
  0 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-06-27 13:13 UTC (permalink / raw)
  To: Sevincer, Abdullah
  Cc: dev, jerinj, Richardson, Bruce, Pathak, Pravin, mattias.ronnblom,
	Aggarwal, Manish, Pavan Nikhilesh, Shijith Thotton,
	Hemant Agrawal, Sachin Saxena, Van Haaren, Harry, Liang Ma,
	Peter Mccarthy

On Thu, Jun 27, 2024 at 12:01 AM Sevincer, Abdullah
<abdullah.sevincer@intel.com> wrote:
>
> Hi Jerin my responses below:
> >+# Is this feature or limitation?
> This is a new feature to enable enqueuing to PMD in any order when the underlined hardware device needs enqueues in a strict dequeue order.
> >+# What is the use case for this feature?
> This is needed by the applications which processes events in batches based on their flow ids. Received burst is sorted based on flow ids.


OK. It is not clear from the Doxygen comment, add more details in
comment in next version, especially if it is applicable for batch
mode.
In general, the concept looks good to me.

Add new  RTE_EVENT_DEV_CAP_* for this feature.
Update doc/guides/eventdevs/features/default.ini and your PMD feature list.

Adding other eventdev PMD maintainers if there are any comments.

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  2024-06-24  8:28       ` Jerin Jacob
@ 2024-07-01  8:24       ` Mattias Rönnblom
  2024-07-01  8:50       ` Mattias Rönnblom
                         ` (3 subsequent siblings)
  5 siblings, 0 replies; 99+ messages in thread
From: Mattias Rönnblom @ 2024-07-01  8:24 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On 2024-06-22 00:24, Abdullah Sevincer wrote:
> This commit adds support flag to enable enqueue reorder
> feature.
> 

"Enqueue reorder" is how this feature is implemented (on DLB2), but it's 
not a good description of what it does (or, allows for).

I've called this feature "independent enqueue" in the past. I have a 
vague memory of someone from Intel calling it something else 
("out-of-order enqueue" maybe?), but I can't seem to be able to find 
that e-mail.

> When this flag is enabled in the port configuration PMD
> restores dequeue order on enqueue if applications happen to
> change it.
> 

If this feature is enabled, the application is free to enqueue events in 
any order, while still maintaining ordered/atomic semantics. That's how 
I would characterize it.

You may also want to note that the DPDK dispatcher library depends on 
this flag to function properly on burst-capable event devices.

This patch set should also include a patch to DSW, where it advertises 
this capability.

Ideally, you should also include a patch to the dispatcher library, 
which checks for this flag on RTE_EVENT_DEV_CAP_BURST_MODE event devices.

> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   lib/eventdev/rte_eventdev.h | 8 ++++++++
>   1 file changed, 8 insertions(+)
> 
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..f4220dd5dc 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>    *  @see rte_event_port_setup()
>    */
>   
> +#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
> +/**< Flag to enable feature enqueue reordering to dequeue.
> + * The feature restores dequeue order on enqueue if applications
> + * happen to change the order.
> + *
> + *  @see rte_event_port_setup()
> + */
> +
>   /** Event port configuration structure */
>   struct rte_event_port_conf {
>   	int32_t new_event_threshold;

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
  2024-06-24  8:28       ` Jerin Jacob
  2024-07-01  8:24       ` Mattias Rönnblom
@ 2024-07-01  8:50       ` Mattias Rönnblom
  2024-07-02 17:25         ` Pathak, Pravin
  2024-07-10  1:20       ` [PATCH v4 0/3] Independent Enqueue Support Abdullah Sevincer
                         ` (2 subsequent siblings)
  5 siblings, 1 reply; 99+ messages in thread
From: Mattias Rönnblom @ 2024-07-01  8:50 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Peter Nilsson, Maria Lingemark

On 2024-06-22 00:24, Abdullah Sevincer wrote:
> This commit adds support flag to enable enqueue reorder
> feature.
> 
> When this flag is enabled in the port configuration PMD
> restores dequeue order on enqueue if applications happen to
> change it.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   lib/eventdev/rte_eventdev.h | 8 ++++++++
>   1 file changed, 8 insertions(+)
> 
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..f4220dd5dc 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -1073,6 +1073,14 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>    *  @see rte_event_port_setup()
>    */
>   
> +#define RTE_EVENT_PORT_CFG_RESTORE_DEQ_ORDER   (1ULL << 5)
> +/**< Flag to enable feature enqueue reordering to dequeue.
> + * The feature restores dequeue order on enqueue if applications
> + * happen to change the order.
> + *

Add a device-level
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
as well.

The documentation of that flag should probably house the detailed 
description of this feature.

Here's how I would describe this feature:

#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)

/**< Flag to enable independent enqueue. Must be unset if the device
  * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
  * allows an application to enqueue RTE_EVENT_OP_FORWARD or
  * RTE_EVENT_OP_RELEASE in an order different than the order the
  * events were dequeued from the event device, while maintaining
  * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
  *
  * If the application wish to change the order of two events *within
  * a flow*, it must both change the enqueue order and exchange the
  * impl_opaque field, to be portable across all event devices.
  */

That second paragraph allows DSW to support this feature without 
modification, since this is the only difference between DSW-style 
independent enqueue, and DLB enqueue reordering. DLB will restore a 
total order, while DSW doesn't (since it would be both pointless and 
costly, given its design).

The upside with DSW-style implementation is that it's very simple and 
efficient, and does not impose any head-of-line blocking (which follows 
from restoring a total order between dequeue and enqueue). The downside 
is it does not allow for a scenario where a particular flow is split 
across different modules, the application performs reordering (e.g., 
with the dispatcher library) *and* wants to maintain ordering between 
events pertaining to those "sub flows". I've never come across such a 
scenario, but it may well exist.

If we fail to make DLB2 and DSW compatible, we'll probably need another 
flag for DSW, because needlessly imposing a total order DSW does not 
make a lot of sense.

You may want to add an example as well. And a note on the importance of 
maintaining impl_opaque between dequeue and enqueue.

> + *  @see rte_event_port_setup()
> + */
> +
>   /** Event port configuration structure */
>   struct rte_event_port_conf {
>   	int32_t new_event_threshold;

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-07-01  8:50       ` Mattias Rönnblom
@ 2024-07-02 17:25         ` Pathak, Pravin
  2024-07-11  3:20           ` Pathak, Pravin
  0 siblings, 1 reply; 99+ messages in thread
From: Pathak, Pravin @ 2024-07-02 17:25 UTC (permalink / raw)
  To: Mattias Rönnblom, Sevincer, Abdullah, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish,
	Peter Nilsson, Maria Lingemark


>+ Add a device-level
>+ RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
>+ as well.
>+ The documentation of that flag should probably house the detailed description of this feature.

So, this capability will be advertised by DSW and DLB devices with detailed documentation. 

>+ Here's how I would describe this feature:

>+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)

>+/**< Flag to enable independent enqueue. Must be unset if the device
>+  * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
>+  * allows an application to enqueue RTE_EVENT_OP_FORWARD or
>+  * RTE_EVENT_OP_RELEASE in an order different than the order the
>+  * events were dequeued from the event device, while maintaining
>+  * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
>+  *
>+  * If the application wish to change the order of two events *within
>+  * a flow*, it must both change the enqueue order and exchange the
>+  * impl_opaque field, to be portable across all event devices.
>+  */

>+That second paragraph allows DSW to support this feature without modification since this is the only difference between DSW-style independent enqueue and DLB enqueue reordering. DLB will restore a total order, while DSW doesn't (since it would be both pointless and costly, given its design).

Can we skip mentioning this mechanism to change the order of any two events intentionally? For DLB, those two events should have been received from the same queue and, if atomic, with the same atomic flow ID. If there is no use case, we can skip it to avoid confusion. 

>+The upside with DSW-style implementation is that it's very simple and efficient, and does not impose any head-of-line blocking (which follows from restoring a total order between dequeue and enqueue). The downside is it does not allow for a scenario where a particular flow is split across different modules, the application performs reordering >+(e.g., with the dispatcher library) *and* wants to maintain ordering between events pertaining to those "sub flows". I've never come across such a scenario, but it may well exist.

>+If we fail to make DLB2 and DSW compatible, we'll probably need another flag for DSW, because needlessly imposing a total order DSW does not make a lot of sense.

If the device has the capability to support independent enqueue, it should enable it on applicable ports using the RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ  flag. Some devices like DSW will not do any reordering inside as they can support it without changing the order whereas devices like DLB which depend on order will reorder events inside their PMD.


>+You may want to add an example as well. And a note on the importance of maintaining impl_opaque between dequeue and enqueue.

We will consider this a separate patch later with an example based on the dispatcher library, which can work with DSW and DLB.  Is the port provided to rte_dispatcher_bind_port_to_lcore() already set up by the application? In that case configuring this feature on the port becomes part of the application. 

> + *  @see rte_event_port_setup()
> + */
> +
>   /** Event port configuration structure */
>   struct rte_event_port_conf {
>   	int32_t new_event_threshold;

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v4 0/3] Independent Enqueue Support
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
                         ` (2 preceding siblings ...)
  2024-07-01  8:50       ` Mattias Rönnblom
@ 2024-07-10  1:20       ` Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                           ` (2 more replies)
  2024-07-10  6:33       ` [PATCH v5 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-11 19:54       ` [PATCH v6 0/3] Independent Enqueue Support Abdullah Sevincer
  5 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  1:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_07.rst    |  11 +
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  21 +
 10 files changed, 408 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v4 1/3] event/dlb2: add support for independent enqueue
  2024-07-10  1:20       ` [PATCH v4 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-07-10  1:20         ` Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 2/3] eventdev: " Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  1:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst |   6 +
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 6 files changed, 378 insertions(+), 190 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 50ffc1f74a..d018bcd0d5 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -156,6 +156,12 @@ New Features
   * Added defer queue reclamation via RCU.
   * Added SVE support for bulk lookup.
 
+* **Updated DLB2 Driver for independent enqueue feature**
+  * Added support for DLB independent enqueue feature. Applications should use
+    ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+    ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v4 2/3] eventdev: add support for independent enqueue
  2024-07-10  1:20       ` [PATCH v4 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-07-10  1:20         ` Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  1:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
to support independent enqueue to support PMD to enqueue in any order
even the underlined hardware device needs enqueues in a strict dequeue
order.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_07.rst    |  4 +++-
 lib/eventdev/rte_eventdev.h               | 21 +++++++++++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index d018bcd0d5..6ad43c679d 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -161,7 +161,9 @@ New Features
     ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
     ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
-
+* **Updated Event Device Library for independent enqueue feature**
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..106b18fe28 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,17 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * When this flag is set, the application can enqueue to PMD in any order even
+ * the underlined hardware device needs enqueues in a strict dequeue order.
+ * PMD will reorder enqueued events based on their dequeue order when the feature
+ * is enabled on an Eventdev port. The capability supports applications that sort
+ * received bursts based on criteria like flow-id or receive QID and process them
+ * in smaller groups. Each processed group is enqueued separately, changing the
+ * dequeue order.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1083,16 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v4 3/3] event/dsw: add capability for independent enqueue
  2024-07-10  1:20       ` [PATCH v4 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-07-10  1:20         ` [PATCH v4 2/3] eventdev: " Abdullah Sevincer
@ 2024-07-10  1:20         ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  1:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst | 3 +++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 6ad43c679d..e97d0e0efa 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -165,6 +165,9 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 Removed Items
 -------------
 
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v5 0/3] Independent Enqueue Support
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
                         ` (3 preceding siblings ...)
  2024-07-10  1:20       ` [PATCH v4 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-07-10  6:33       ` Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                           ` (2 more replies)
  2024-07-11 19:54       ` [PATCH v6 0/3] Independent Enqueue Support Abdullah Sevincer
  5 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  6:33 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_07.rst    |  15 +
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  21 +
 10 files changed, 412 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v5 1/3] event/dlb2: add support for independent enqueue
  2024-07-10  6:33       ` [PATCH v5 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-07-10  6:33         ` Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 2/3] eventdev: " Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  6:33 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst |   6 +
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 6 files changed, 378 insertions(+), 190 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 50ffc1f74a..76a79093d1 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -156,6 +156,12 @@ New Features
   * Added defer queue reclamation via RCU.
   * Added SVE support for bulk lookup.
 
+* **Updated DLB2 Driver for independent enqueue feature**
+
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v5 2/3] eventdev: add support for independent enqueue
  2024-07-10  6:33       ` [PATCH v5 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-07-10  6:33         ` Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  6:33 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
to support independent enqueue to support PMD to enqueue in any order
even the underlined hardware device needs enqueues in a strict dequeue
order.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_07.rst    |  5 +++++
 lib/eventdev/rte_eventdev.h               | 21 +++++++++++++++++++++
 4 files changed, 28 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 76a79093d1..d8564b19f0 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -162,6 +162,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..106b18fe28 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,17 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * When this flag is set, the application can enqueue to PMD in any order even
+ * the underlined hardware device needs enqueues in a strict dequeue order.
+ * PMD will reorder enqueued events based on their dequeue order when the feature
+ * is enabled on an Eventdev port. The capability supports applications that sort
+ * received bursts based on criteria like flow-id or receive QID and process them
+ * in smaller groups. Each processed group is enqueued separately, changing the
+ * dequeue order.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1083,16 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v5 3/3] event/dsw: add capability for independent enqueue
  2024-07-10  6:33       ` [PATCH v5 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-07-10  6:33         ` [PATCH v5 2/3] eventdev: " Abdullah Sevincer
@ 2024-07-10  6:33         ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-10  6:33 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index d8564b19f0..c04f014943 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -167,6 +167,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v3 2/2] eventdev: add support for enqueue reorder
  2024-07-02 17:25         ` Pathak, Pravin
@ 2024-07-11  3:20           ` Pathak, Pravin
  0 siblings, 0 replies; 99+ messages in thread
From: Pathak, Pravin @ 2024-07-11  3:20 UTC (permalink / raw)
  To: Pathak, Pravin, Mattias Rönnblom, Sevincer, Abdullah, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish,
	Peter Nilsson, Maria Lingemark


>+ Add a device-level
>+ RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
>+ as well.
>+ The documentation of that flag should probably house the detailed description of this feature.

So, this capability will be advertised by DSW and DLB devices with detailed documentation. 
What about DPAA and OPDL devices supporting burst mode? Do these support independent enqueue ?



^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v6 0/3] Independent Enqueue Support
  2024-06-21 22:24     ` [PATCH v3 2/2] eventdev: add support for enqueue reorder Abdullah Sevincer
                         ` (4 preceding siblings ...)
  2024-07-10  6:33       ` [PATCH v5 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-07-11 19:54       ` Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                           ` (2 more replies)
  5 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-11 19:54 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_07.rst    |  15 +
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  36 ++
 11 files changed, 468 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v6 1/3] event/dlb2: add support for independent enqueue
  2024-07-11 19:54       ` [PATCH v6 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-07-11 19:54         ` Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 2/3] eventdev: " Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-11 19:54 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_07.rst |   6 +
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 419 insertions(+), 190 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..d41a4e4534 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ `` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 50ffc1f74a..76a79093d1 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -156,6 +156,12 @@ New Features
   * Added defer queue reclamation via RCU.
   * Added SVE support for bulk lookup.
 
+* **Updated DLB2 Driver for independent enqueue feature**
+
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v6 2/3] eventdev: add support for independent enqueue
  2024-07-11 19:54       ` [PATCH v6 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-07-11 19:54         ` Abdullah Sevincer
  2024-07-23  6:40           ` Mattias Rönnblom
                             ` (2 more replies)
  2024-07-11 19:54         ` [PATCH v6 " Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-11 19:54 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
to support independent enqueue to support PMD to enqueue in any order
even the underlined hardware device needs enqueues in a strict dequeue
order.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_07.rst    |  5 ++++
 lib/eventdev/rte_eventdev.h               | 36 +++++++++++++++++++++++
 4 files changed, 43 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index 76a79093d1..d8564b19f0 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -162,6 +162,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..462fc3f18f 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,30 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The order restoration
+ * only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW) will
+ * be transmitted in the order the application enqueues them. New events do not
+ * maintain any order relative to FORWARD/RELEASE events.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1096,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v6 3/3] event/dsw: add capability for independent enqueue
  2024-07-11 19:54       ` [PATCH v6 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-07-11 19:54         ` [PATCH v6 2/3] eventdev: " Abdullah Sevincer
@ 2024-07-11 19:54         ` Abdullah Sevincer
  2024-07-23  6:41           ` Mattias Rönnblom
  2 siblings, 1 reply; 99+ messages in thread
From: Abdullah Sevincer @ 2024-07-11 19:54 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_07.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index d8564b19f0..c04f014943 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -167,6 +167,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v6 2/3] eventdev: add support for independent enqueue
  2024-07-11 19:54         ` [PATCH v6 2/3] eventdev: " Abdullah Sevincer
@ 2024-07-23  6:40           ` Mattias Rönnblom
  2024-07-29 13:49             ` Pathak, Pravin
  2024-08-12 18:41           ` [PATCH v7 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-12 20:00           ` [PATCH v8 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 1 reply; 99+ messages in thread
From: Mattias Rönnblom @ 2024-07-23  6:40 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On 2024-07-11 21:54, Abdullah Sevincer wrote:
> This commit adds support for independent enqueue feature
> and updates Event Device and PMD feature list.
> 
> A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
> to support independent enqueue to support PMD to enqueue in any order
> even the underlined hardware device needs enqueues in a strict dequeue
> order.
> 
> To use this capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   doc/guides/eventdevs/features/default.ini |  1 +
>   doc/guides/eventdevs/features/dlb2.ini    |  1 +
>   doc/guides/rel_notes/release_24_07.rst    |  5 ++++
>   lib/eventdev/rte_eventdev.h               | 36 +++++++++++++++++++++++
>   4 files changed, 43 insertions(+)
> 
> diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
> index 1cc4303fe5..7c4ee99238 100644
> --- a/doc/guides/eventdevs/features/default.ini
> +++ b/doc/guides/eventdevs/features/default.ini
> @@ -22,6 +22,7 @@ carry_flow_id              =
>   maintenance_free           =
>   runtime_queue_attr         =
>   profile_links              =
> +independent_enq            =
>   
>   ;
>   ; Features of a default Ethernet Rx adapter.
> diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
> index 7b80286927..c7193b47c1 100644
> --- a/doc/guides/eventdevs/features/dlb2.ini
> +++ b/doc/guides/eventdevs/features/dlb2.ini
> @@ -15,6 +15,7 @@ implicit_release_disable   = Y
>   runtime_port_link          = Y
>   multiple_queue_port        = Y
>   maintenance_free           = Y
> +independent_enq            = Y
>   
>   [Eth Rx adapter Features]
>   
> diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
> index 76a79093d1..d8564b19f0 100644
> --- a/doc/guides/rel_notes/release_24_07.rst
> +++ b/doc/guides/rel_notes/release_24_07.rst
> @@ -162,6 +162,11 @@ New Features
>     ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
>     ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
>   
> +* **Updated Event Device Library for independent enqueue feature**
> +
> +  * Added support for independent enqueue feature. Updated Event Device and
> +    PMD feature list.
> +
>   
>   Removed Items
>   -------------
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..462fc3f18f 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -446,6 +446,30 @@ struct rte_event;
>    * @see RTE_SCHED_TYPE_PARALLEL
>    */
>   
> +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
> +/**< Event device is capable of independent enqueue.
> + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
> + * supports the enqueue in any order or specifically in a different order than the
> + * dequeue. Eventdev PMD can either transmit events in the changed order in which
> + * they are enqueued or restore the original order before sending them to the
> + * underlying hardware device. A flag is provided during the port configuration to
> + * inform Eventdev PMD that the application intends to use an independent enqueue
> + * order on a particular port. Note that this capability only matters for Eventdevs
> + * supporting burst mode.
> + *
> + * To Inform PMD that the application plans to use independent enqueue order on a port
> + * this code example can be used:
> + *
> + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> + *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> + *
> + * When an implicit release is enabled on a port, Eventdev PMD will also handle
> + * the insertion of RELEASE events in place of dropped events. The order restoration

"The independent enqueue feature only applies to /../" maybe?

> + * only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW) will
> + * be transmitted in the order the application enqueues them. New events do not
> + * maintain any order relative to FORWARD/RELEASE events.

Is FORWARD<->NEW relaxed ordering specific to ports which has enabled 
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ?

If not, that information should probably be somewhere else.

Either way,
Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

> + */
> +
>   /* Event device priority levels */
>   #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
>   /**< Highest priority level for events and queues.
> @@ -1072,6 +1096,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>    *
>    *  @see rte_event_port_setup()
>    */
> + #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> +/**< Flag to enable independent enqueue. Must not be set if the device
> + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> + * RTE_EVENT_OP_RELEASE in an order different than the order the
> + * events were dequeued from the event device, while maintaining
> + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> + *
> + * Note that this flag only matters for Eventdevs supporting burst mode.
> + *
> + *  @see rte_event_port_setup()
> + */
>   
>   /** Event port configuration structure */
>   struct rte_event_port_conf {

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v6 3/3] event/dsw: add capability for independent enqueue
  2024-07-11 19:54         ` [PATCH v6 " Abdullah Sevincer
@ 2024-07-23  6:41           ` Mattias Rönnblom
  0 siblings, 0 replies; 99+ messages in thread
From: Mattias Rönnblom @ 2024-07-23  6:41 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On 2024-07-11 21:54, Abdullah Sevincer wrote:
> To use independent enqueue capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
> commit adds the capability of independent enqueue to the DSW driver.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   doc/guides/rel_notes/release_24_07.rst | 4 ++++
>   drivers/event/dsw/dsw_evdev.c          | 3 ++-
>   2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
> index d8564b19f0..c04f014943 100644
> --- a/doc/guides/rel_notes/release_24_07.rst
> +++ b/doc/guides/rel_notes/release_24_07.rst
> @@ -167,6 +167,10 @@ New Features
>     * Added support for independent enqueue feature. Updated Event Device and
>       PMD feature list.
>   
> +* **Updated DSW Driver for independent enqueue feature**
> +
> +  * Added capability flag for DSW to advertise independent enqueue feature.
> +
>   
>   Removed Items
>   -------------
> diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
> index 0dea1091e3..5c483d869c 100644
> --- a/drivers/event/dsw/dsw_evdev.c
> +++ b/drivers/event/dsw/dsw_evdev.c
> @@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
>   		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
>   		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
>   		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
> -		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
> +		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
> +		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
>   	};
>   }
>   

Thanks!

Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v6 2/3] eventdev: add support for independent enqueue
  2024-07-23  6:40           ` Mattias Rönnblom
@ 2024-07-29 13:49             ` Pathak, Pravin
  2024-08-13 15:00               ` Sevincer, Abdullah
  0 siblings, 1 reply; 99+ messages in thread
From: Pathak, Pravin @ 2024-07-29 13:49 UTC (permalink / raw)
  To: Mattias Rönnblom, Sevincer, Abdullah, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

> On 2024-07-11 21:54, Abdullah Sevincer wrote:
> > This commit adds support for independent enqueue feature and updates
> > Event Device and PMD feature list.
> >
> > A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced to
> > support independent enqueue to support PMD to enqueue in any order
> > even the underlined hardware device needs enqueues in a strict dequeue
> > order.
> >
> > To use this capability applications need to set flag
> > RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> > capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
> >
> > Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> > ---
> >   doc/guides/eventdevs/features/default.ini |  1 +
> >   doc/guides/eventdevs/features/dlb2.ini    |  1 +
> >   doc/guides/rel_notes/release_24_07.rst    |  5 ++++
> >   lib/eventdev/rte_eventdev.h               | 36 +++++++++++++++++++++++
> >   4 files changed, 43 insertions(+)
> >
> > diff --git a/doc/guides/eventdevs/features/default.ini
> > b/doc/guides/eventdevs/features/default.ini
> > index 1cc4303fe5..7c4ee99238 100644
> > --- a/doc/guides/eventdevs/features/default.ini
> > +++ b/doc/guides/eventdevs/features/default.ini
> > @@ -22,6 +22,7 @@ carry_flow_id              =
> >   maintenance_free           =
> >   runtime_queue_attr         =
> >   profile_links              =
> > +independent_enq            =
> >
> >   ;
> >   ; Features of a default Ethernet Rx adapter.
> > diff --git a/doc/guides/eventdevs/features/dlb2.ini
> > b/doc/guides/eventdevs/features/dlb2.ini
> > index 7b80286927..c7193b47c1 100644
> > --- a/doc/guides/eventdevs/features/dlb2.ini
> > +++ b/doc/guides/eventdevs/features/dlb2.ini
> > @@ -15,6 +15,7 @@ implicit_release_disable   = Y
> >   runtime_port_link          = Y
> >   multiple_queue_port        = Y
> >   maintenance_free           = Y
> > +independent_enq            = Y
> >
> >   [Eth Rx adapter Features]
> >
> > diff --git a/doc/guides/rel_notes/release_24_07.rst
> > b/doc/guides/rel_notes/release_24_07.rst
> > index 76a79093d1..d8564b19f0 100644
> > --- a/doc/guides/rel_notes/release_24_07.rst
> > +++ b/doc/guides/rel_notes/release_24_07.rst
> > @@ -162,6 +162,11 @@ New Features
> >     ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
> capability
> >     ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
> >
> > +* **Updated Event Device Library for independent enqueue feature**
> > +
> > +  * Added support for independent enqueue feature. Updated Event Device
> and
> > +    PMD feature list.
> > +
> >
> >   Removed Items
> >   -------------
> > diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> > index 08e5f9320b..462fc3f18f 100644
> > --- a/lib/eventdev/rte_eventdev.h
> > +++ b/lib/eventdev/rte_eventdev.h
> > @@ -446,6 +446,30 @@ struct rte_event;
> >    * @see RTE_SCHED_TYPE_PARALLEL
> >    */
> >
> > +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16) /**< Event
> > +device is capable of independent enqueue.
> > + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate
> > +that Eventdev
> > + * supports the enqueue in any order or specifically in a different
> > +order than the
> > + * dequeue. Eventdev PMD can either transmit events in the changed
> > +order in which
> > + * they are enqueued or restore the original order before sending
> > +them to the
> > + * underlying hardware device. A flag is provided during the port
> > +configuration to
> > + * inform Eventdev PMD that the application intends to use an
> > +independent enqueue
> > + * order on a particular port. Note that this capability only matters
> > +for Eventdevs
> > + * supporting burst mode.
> > + *
> > + * To Inform PMD that the application plans to use independent
> > +enqueue order on a port
> > + * this code example can be used:
> > + *
> > + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> > + *     port_config = port_config |
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> > + *
> > + * When an implicit release is enabled on a port, Eventdev PMD will
> > +also handle
> > + * the insertion of RELEASE events in place of dropped events. The
> > +order restoration
> 
> "The independent enqueue feature only applies to /../" maybe?
> 
> > + * only applies to FORWARD and RELEASE events. New events
> > + (op=RTE_EVENT_OP_NEW) will
> > + * be transmitted in the order the application enqueues them. New
> > + events do not
> > + * maintain any order relative to FORWARD/RELEASE events.
> 
> Is FORWARD<->NEW relaxed ordering specific to ports which has enabled
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ?

Yes. Relaxed ordering only applies to ports with independent enqueue enabled.
We will update the documentation. 

> If not, that information should probably be somewhere else.
> 
> Either way,
> Reviewed-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> 
> > + */
> > +
> >   /* Event device priority levels */
> >   #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
> >   /**< Highest priority level for events and queues.
> > @@ -1072,6 +1096,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t
> queue_id, uint32_t attr_id,
> >    *
> >    *  @see rte_event_port_setup()
> >    */
> > + #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> > +/**< Flag to enable independent enqueue. Must not be set if the
> > +device
> > + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> > + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> > + * RTE_EVENT_OP_RELEASE in an order different than the order the
> > + * events were dequeued from the event device, while maintaining
> > + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> > + *
> > + * Note that this flag only matters for Eventdevs supporting burst mode.
> > + *
> > + *  @see rte_event_port_setup()
> > + */
> >
> >   /** Event port configuration structure */
> >   struct rte_event_port_conf {

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v7 0/3] Independent Enqueue Support
  2024-07-11 19:54         ` [PATCH v6 2/3] eventdev: " Abdullah Sevincer
  2024-07-23  6:40           ` Mattias Rönnblom
@ 2024-08-12 18:41           ` Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                               ` (2 more replies)
  2024-08-12 20:00           ` [PATCH v8 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 18:41 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v7: Address documentation reviews.
v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  34 +-
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 463 insertions(+), 216 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v7 1/3] event/dlb2: add support for independent enqueue
  2024-08-12 18:41           ` [PATCH v7 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-12 18:41             ` Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 2/3] eventdev: " Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 18:41 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |  33 +-
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 417 insertions(+), 219 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..d74c6f7fd1 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ``capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..f0ec07c263 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -24,36 +24,11 @@ DPDK Release 24.11
 New Features
 ------------
 
-.. This section should contain new features added in this release.
-   Sample format:
+* **Updated DLB2 Driver for independent enqueue feature**
 
-   * **Add a title in the past tense with a full stop.**
-
-     Add a short 1-2 sentence description in the past tense.
-     The description should be enough to allow someone scanning
-     the release notes to understand the new feature.
-
-     If the feature adds a lot of sub-features you can use a bullet list
-     like this:
-
-     * Added feature foo to do something.
-     * Enhanced feature bar to do something else.
-
-     Refer to the previous release notes for examples.
-
-     Suggested order in release notes items:
-     * Core libs (EAL, mempool, ring, mbuf, buses)
-     * Device abstraction libs and PMDs (ordered alphabetically by vendor name)
-       - ethdev (lib, PMDs)
-       - cryptodev (lib, PMDs)
-       - eventdev (lib, PMDs)
-       - etc
-     * Other libs
-     * Apps, Examples, Tools (if significant)
-
-     This section is a comment. Do not overwrite or remove it.
-     Also, make sure to start the actual text at the margin.
-     =======================================================
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
 
 Removed Items
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v7 2/3] eventdev: add support for independent enqueue
  2024-08-12 18:41           ` [PATCH v7 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-08-12 18:41             ` Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 18:41 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
to support independent enqueue to support PMD to enqueue in any order
even the underlined hardware device needs enqueues in a strict dequeue
order.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index f0ec07c263..04f389876a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -30,6 +30,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..48e6eadda9 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v7 3/3] event/dsw: add capability for independent enqueue
  2024-08-12 18:41           ` [PATCH v7 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-08-12 18:41             ` [PATCH v7 2/3] eventdev: " Abdullah Sevincer
@ 2024-08-12 18:41             ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 18:41 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 04f389876a..b8d1f36e54 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -35,6 +35,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v8 0/3] Independent Enqueue Support
  2024-07-11 19:54         ` [PATCH v6 2/3] eventdev: " Abdullah Sevincer
  2024-07-23  6:40           ` Mattias Rönnblom
  2024-08-12 18:41           ` [PATCH v7 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-12 20:00           ` Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                               ` (2 more replies)
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 20:00 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  34 +-
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 463 insertions(+), 216 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v8 1/3] event/dlb2: add support for independent enqueue
  2024-08-12 20:00           ` [PATCH v8 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-12 20:00             ` Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 2/3] eventdev: " Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 20:00 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |  33 +-
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 417 insertions(+), 219 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..8b973cf81e 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..f0ec07c263 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -24,36 +24,11 @@ DPDK Release 24.11
 New Features
 ------------
 
-.. This section should contain new features added in this release.
-   Sample format:
+* **Updated DLB2 Driver for independent enqueue feature**
 
-   * **Add a title in the past tense with a full stop.**
-
-     Add a short 1-2 sentence description in the past tense.
-     The description should be enough to allow someone scanning
-     the release notes to understand the new feature.
-
-     If the feature adds a lot of sub-features you can use a bullet list
-     like this:
-
-     * Added feature foo to do something.
-     * Enhanced feature bar to do something else.
-
-     Refer to the previous release notes for examples.
-
-     Suggested order in release notes items:
-     * Core libs (EAL, mempool, ring, mbuf, buses)
-     * Device abstraction libs and PMDs (ordered alphabetically by vendor name)
-       - ethdev (lib, PMDs)
-       - cryptodev (lib, PMDs)
-       - eventdev (lib, PMDs)
-       - etc
-     * Other libs
-     * Apps, Examples, Tools (if significant)
-
-     This section is a comment. Do not overwrite or remove it.
-     Also, make sure to start the actual text at the margin.
-     =======================================================
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
 
 Removed Items
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-12 20:00           ` [PATCH v8 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-08-12 20:00             ` Abdullah Sevincer
  2024-08-23 11:02               ` Mattias Rönnblom
                                 ` (2 more replies)
  2024-08-12 20:00             ` [PATCH v8 " Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 20:00 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
to support independent enqueue to support PMD to enqueue in any order
even the underlined hardware device needs enqueues in a strict dequeue
order.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index f0ec07c263..04f389876a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -30,6 +30,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..48e6eadda9 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+ #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v8 3/3] event/dsw: add capability for independent enqueue
  2024-08-12 20:00           ` [PATCH v8 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-08-12 20:00             ` [PATCH v8 2/3] eventdev: " Abdullah Sevincer
@ 2024-08-12 20:00             ` Abdullah Sevincer
  2024-08-23 11:03               ` Mattias Rönnblom
  2 siblings, 1 reply; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-12 20:00 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 04f389876a..b8d1f36e54 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -35,6 +35,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v6 2/3] eventdev: add support for independent enqueue
  2024-07-29 13:49             ` Pathak, Pravin
@ 2024-08-13 15:00               ` Sevincer, Abdullah
  2024-08-20 16:48                 ` Sevincer, Abdullah
  0 siblings, 1 reply; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-08-13 15:00 UTC (permalink / raw)
  To: Pathak, Pravin, Mattias Rönnblom, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

Thanks,

I have addressed the documentation changes with patch set v8. If it all looks good let me know.

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v6 2/3] eventdev: add support for independent enqueue
  2024-08-13 15:00               ` Sevincer, Abdullah
@ 2024-08-20 16:48                 ` Sevincer, Abdullah
  0 siblings, 0 replies; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-08-20 16:48 UTC (permalink / raw)
  To: Sevincer, Abdullah, Pathak, Pravin, Mattias Rönnblom, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

Hi Folks,

Any chance to look at the patches? Don’t want to lose the window for API changes again 😊


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-12 20:00             ` [PATCH v8 2/3] eventdev: " Abdullah Sevincer
@ 2024-08-23 11:02               ` Mattias Rönnblom
  2024-08-24 20:41                 ` Pathak, Pravin
  2024-08-29 17:36               ` [PATCH v9 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-30 16:23               ` [PATCH v10 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 1 reply; 99+ messages in thread
From: Mattias Rönnblom @ 2024-08-23 11:02 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On 2024-08-12 22:00, Abdullah Sevincer wrote:
> This commit adds support for independent enqueue feature
> and updates Event Device and PMD feature list.
> 
> A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced
> to support independent enqueue to support PMD to enqueue in any order
> even the underlined hardware device needs enqueues in a strict dequeue

This sentence needs to be rephrased.

My attempt:
"A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. An 
application may, on an event device where independent enqueue is 
supported, using an event port where it is enabled, enqueue 
RTE_EVENT_OP_FORWARD or RELEASE type events in any order."

> order.
> 
> To use this capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   doc/guides/eventdevs/features/default.ini |  1 +
>   doc/guides/eventdevs/features/dlb2.ini    |  1 +
>   doc/guides/rel_notes/release_24_11.rst    |  5 +++
>   lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
>   4 files changed, 44 insertions(+)
> 
> diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
> index 1cc4303fe5..7c4ee99238 100644
> --- a/doc/guides/eventdevs/features/default.ini
> +++ b/doc/guides/eventdevs/features/default.ini
> @@ -22,6 +22,7 @@ carry_flow_id              =
>   maintenance_free           =
>   runtime_queue_attr         =
>   profile_links              =
> +independent_enq            =
>   
>   ;
>   ; Features of a default Ethernet Rx adapter.
> diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
> index 7b80286927..c7193b47c1 100644
> --- a/doc/guides/eventdevs/features/dlb2.ini
> +++ b/doc/guides/eventdevs/features/dlb2.ini
> @@ -15,6 +15,7 @@ implicit_release_disable   = Y
>   runtime_port_link          = Y
>   multiple_queue_port        = Y
>   maintenance_free           = Y
> +independent_enq            = Y
>   
>   [Eth Rx adapter Features]
>   
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index f0ec07c263..04f389876a 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -30,6 +30,11 @@ New Features
>     ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
>     ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
>   
> +* **Updated Event Device Library for independent enqueue feature**
> +
> +  * Added support for independent enqueue feature. Updated Event Device and
> +    PMD feature list.
> +
>   
>   Removed Items
>   -------------
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..48e6eadda9 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -446,6 +446,31 @@ struct rte_event;
>    * @see RTE_SCHED_TYPE_PARALLEL
>    */
>   
> +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
> +/**< Event device is capable of independent enqueue.
> + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
> + * supports the enqueue in any order or specifically in a different order than the
> + * dequeue. Eventdev PMD can either transmit events in the changed order in which
> + * they are enqueued or restore the original order before sending them to the
> + * underlying hardware device. A flag is provided during the port configuration to
> + * inform Eventdev PMD that the application intends to use an independent enqueue
> + * order on a particular port. Note that this capability only matters for Eventdevs
> + * supporting burst mode.
> + *
> + * To Inform PMD that the application plans to use independent enqueue order on a port
> + * this code example can be used:
> + *
> + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> + *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> + *
> + * When an implicit release is enabled on a port, Eventdev PMD will also handle
> + * the insertion of RELEASE events in place of dropped events. The independent enqueue
> + * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
> + * will be transmitted in the order the application enqueues them and do not maintain
> + * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
> + * only applies to ports that have enabled independent enqueue feature.
> + */
> +
>   /* Event device priority levels */
>   #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
>   /**< Highest priority level for events and queues.
> @@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>    *
>    *  @see rte_event_port_setup()
>    */
> + #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> +/**< Flag to enable independent enqueue. Must not be set if the device
> + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> + * RTE_EVENT_OP_RELEASE in an order different than the order the
> + * events were dequeued from the event device, while maintaining
> + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> + *
> + * Note that this flag only matters for Eventdevs supporting burst mode.
> + *
> + *  @see rte_event_port_setup()
> + */
>   
>   /** Event port configuration structure */
>   struct rte_event_port_conf {

Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v8 3/3] event/dsw: add capability for independent enqueue
  2024-08-12 20:00             ` [PATCH v8 " Abdullah Sevincer
@ 2024-08-23 11:03               ` Mattias Rönnblom
  0 siblings, 0 replies; 99+ messages in thread
From: Mattias Rönnblom @ 2024-08-23 11:03 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On 2024-08-12 22:00, Abdullah Sevincer wrote:
> To use independent enqueue capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
> commit adds the capability of independent enqueue to the DSW driver.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> ---
>   doc/guides/rel_notes/release_24_11.rst | 4 ++++
>   drivers/event/dsw/dsw_evdev.c          | 3 ++-
>   2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index 04f389876a..b8d1f36e54 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -35,6 +35,10 @@ New Features
>     * Added support for independent enqueue feature. Updated Event Device and
>       PMD feature list.
>   
> +* **Updated DSW Driver for independent enqueue feature**
> +
> +  * Added capability flag for DSW to advertise independent enqueue feature.
> +
>   
>   Removed Items
>   -------------
> diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
> index 0dea1091e3..5c483d869c 100644
> --- a/drivers/event/dsw/dsw_evdev.c
> +++ b/drivers/event/dsw/dsw_evdev.c
> @@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
>   		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
>   		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
>   		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
> -		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
> +		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
> +		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
>   	};
>   }
>   

Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>


^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-23 11:02               ` Mattias Rönnblom
@ 2024-08-24 20:41                 ` Pathak, Pravin
  2024-08-27 18:33                   ` Sevincer, Abdullah
  2024-08-28 16:45                   ` Mattias Rönnblom
  0 siblings, 2 replies; 99+ messages in thread
From: Pathak, Pravin @ 2024-08-24 20:41 UTC (permalink / raw)
  To: Mattias Rönnblom, Sevincer, Abdullah, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish



> -----Original Message-----
> From: Mattias Rönnblom <hofors@lysator.liu.se>
> Sent: Friday, August 23, 2024 7:03 AM
> To: Sevincer, Abdullah <abdullah.sevincer@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; Richardson, Bruce <bruce.richardson@intel.com>;
> Pathak, Pravin <pravin.pathak@intel.com>; mattias.ronnblom@ericsson.com;
> Aggarwal, Manish <manish.aggarwal@intel.com>
> Subject: Re: [PATCH v8 2/3] eventdev: add support for independent enqueue
> 
> On 2024-08-12 22:00, Abdullah Sevincer wrote:
> > This commit adds support for independent enqueue feature and updates
> > Event Device and PMD feature list.
> >
> > A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced to
> > support independent enqueue to support PMD to enqueue in any order
> > even the underlined hardware device needs enqueues in a strict dequeue
> 
> This sentence needs to be rephrased.
> 
> My attempt:
> "A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. An
> application may, on an event device where independent enqueue is supported,
> using an event port where it is enabled, enqueue RTE_EVENT_OP_FORWARD or
> RELEASE type events in any order."
> 
> > order.

Will this work: 
A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, is introduced. It 
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type 
events on an event port where this capability is enabled. 

> >
> > To use this capability applications need to set flag
> > RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> > capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
> >
> > Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> > ---
> >   doc/guides/eventdevs/features/default.ini |  1 +
> >   doc/guides/eventdevs/features/dlb2.ini    |  1 +
> >   doc/guides/rel_notes/release_24_11.rst    |  5 +++
> >   lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
> >   4 files changed, 44 insertions(+)
> >
> > diff --git a/doc/guides/eventdevs/features/default.ini
> > b/doc/guides/eventdevs/features/default.ini
> > index 1cc4303fe5..7c4ee99238 100644
> > --- a/doc/guides/eventdevs/features/default.ini
> > +++ b/doc/guides/eventdevs/features/default.ini
> > @@ -22,6 +22,7 @@ carry_flow_id              =
> >   maintenance_free           =
> >   runtime_queue_attr         =
> >   profile_links              =
> > +independent_enq            =
> >
> >   ;
> >   ; Features of a default Ethernet Rx adapter.
> > diff --git a/doc/guides/eventdevs/features/dlb2.ini
> > b/doc/guides/eventdevs/features/dlb2.ini
> > index 7b80286927..c7193b47c1 100644
> > --- a/doc/guides/eventdevs/features/dlb2.ini
> > +++ b/doc/guides/eventdevs/features/dlb2.ini
> > @@ -15,6 +15,7 @@ implicit_release_disable   = Y
> >   runtime_port_link          = Y
> >   multiple_queue_port        = Y
> >   maintenance_free           = Y
> > +independent_enq            = Y
> >
> >   [Eth Rx adapter Features]
> >
> > diff --git a/doc/guides/rel_notes/release_24_11.rst
> > b/doc/guides/rel_notes/release_24_11.rst
> > index f0ec07c263..04f389876a 100644
> > --- a/doc/guides/rel_notes/release_24_11.rst
> > +++ b/doc/guides/rel_notes/release_24_11.rst
> > @@ -30,6 +30,11 @@ New Features
> >     ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
> capability
> >     ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
> >
> > +* **Updated Event Device Library for independent enqueue feature**
> > +
> > +  * Added support for independent enqueue feature. Updated Event Device
> and
> > +    PMD feature list.
> > +
> >
> >   Removed Items
> >   -------------
> > diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> > index 08e5f9320b..48e6eadda9 100644
> > --- a/lib/eventdev/rte_eventdev.h
> > +++ b/lib/eventdev/rte_eventdev.h
> > @@ -446,6 +446,31 @@ struct rte_event;
> >    * @see RTE_SCHED_TYPE_PARALLEL
> >    */
> >
> > +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16) /**< Event
> > +device is capable of independent enqueue.
> > + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate
> > +that Eventdev
> > + * supports the enqueue in any order or specifically in a different
> > +order than the
> > + * dequeue. Eventdev PMD can either transmit events in the changed
> > +order in which
> > + * they are enqueued or restore the original order before sending
> > +them to the
> > + * underlying hardware device. A flag is provided during the port
> > +configuration to
> > + * inform Eventdev PMD that the application intends to use an
> > +independent enqueue
> > + * order on a particular port. Note that this capability only matters
> > +for Eventdevs
> > + * supporting burst mode.
> > + *
> > + * To Inform PMD that the application plans to use independent
> > +enqueue order on a port
> > + * this code example can be used:
> > + *
> > + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> > + *     port_config = port_config |
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> > + *
> > + * When an implicit release is enabled on a port, Eventdev PMD will
> > +also handle
> > + * the insertion of RELEASE events in place of dropped events. The
> > +independent enqueue
> > + * feature only applies to FORWARD and RELEASE events. New events
> > +(op=RTE_EVENT_OP_NEW)
> > + * will be transmitted in the order the application enqueues them and
> > +do not maintain
> > + * any order relative to FORWARD/RELEASE events. FORWARD vs NEW
> > +relaxed ordering
> > + * only applies to ports that have enabled independent enqueue feature.
> > + */
> > +
> >   /* Event device priority levels */
> >   #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
> >   /**< Highest priority level for events and queues.
> > @@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t
> queue_id, uint32_t attr_id,
> >    *
> >    *  @see rte_event_port_setup()
> >    */
> > + #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> > +/**< Flag to enable independent enqueue. Must not be set if the
> > +device
> > + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> > + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> > + * RTE_EVENT_OP_RELEASE in an order different than the order the
> > + * events were dequeued from the event device, while maintaining
> > + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> > + *
> > + * Note that this flag only matters for Eventdevs supporting burst mode.
> > + *
> > + *  @see rte_event_port_setup()
> > + */
> >
> >   /** Event port configuration structure */
> >   struct rte_event_port_conf {
> 
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-24 20:41                 ` Pathak, Pravin
@ 2024-08-27 18:33                   ` Sevincer, Abdullah
  2024-08-28 16:45                   ` Mattias Rönnblom
  1 sibling, 0 replies; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-08-27 18:33 UTC (permalink / raw)
  To: Pathak, Pravin, Mattias Rönnblom, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

Hi Mattias,
I will update patch tomorrow with updated suggestion from Pravin, If I don’t hear from you I guess you are okay?


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-24 20:41                 ` Pathak, Pravin
  2024-08-27 18:33                   ` Sevincer, Abdullah
@ 2024-08-28 16:45                   ` Mattias Rönnblom
  2024-08-28 16:59                     ` Sevincer, Abdullah
  1 sibling, 1 reply; 99+ messages in thread
From: Mattias Rönnblom @ 2024-08-28 16:45 UTC (permalink / raw)
  To: Pathak, Pravin, Sevincer, Abdullah, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

On 2024-08-24 22:41, Pathak, Pravin wrote:
> 
> 
>> -----Original Message-----
>> From: Mattias Rönnblom <hofors@lysator.liu.se>
>> Sent: Friday, August 23, 2024 7:03 AM
>> To: Sevincer, Abdullah <abdullah.sevincer@intel.com>; dev@dpdk.org
>> Cc: jerinj@marvell.com; Richardson, Bruce <bruce.richardson@intel.com>;
>> Pathak, Pravin <pravin.pathak@intel.com>; mattias.ronnblom@ericsson.com;
>> Aggarwal, Manish <manish.aggarwal@intel.com>
>> Subject: Re: [PATCH v8 2/3] eventdev: add support for independent enqueue
>>
>> On 2024-08-12 22:00, Abdullah Sevincer wrote:
>>> This commit adds support for independent enqueue feature and updates
>>> Event Device and PMD feature list.
>>>
>>> A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced to
>>> support independent enqueue to support PMD to enqueue in any order
>>> even the underlined hardware device needs enqueues in a strict dequeue
>>
>> This sentence needs to be rephrased.
>>
>> My attempt:
>> "A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. An
>> application may, on an event device where independent enqueue is supported,
>> using an event port where it is enabled, enqueue RTE_EVENT_OP_FORWARD or
>> RELEASE type events in any order."
>>
>>> order.
> 
> Will this work:
> A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, is introduced. It
> allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
> events on an event port where this capability is enabled.
> 

Sounds good and better than my attempt.

>>>
>>> To use this capability applications need to set flag
>>> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
>>> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
>>>
>>> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
>>> ---
>>>    doc/guides/eventdevs/features/default.ini |  1 +
>>>    doc/guides/eventdevs/features/dlb2.ini    |  1 +
>>>    doc/guides/rel_notes/release_24_11.rst    |  5 +++
>>>    lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
>>>    4 files changed, 44 insertions(+)
>>>
>>> diff --git a/doc/guides/eventdevs/features/default.ini
>>> b/doc/guides/eventdevs/features/default.ini
>>> index 1cc4303fe5..7c4ee99238 100644
>>> --- a/doc/guides/eventdevs/features/default.ini
>>> +++ b/doc/guides/eventdevs/features/default.ini
>>> @@ -22,6 +22,7 @@ carry_flow_id              =
>>>    maintenance_free           =
>>>    runtime_queue_attr         =
>>>    profile_links              =
>>> +independent_enq            =
>>>
>>>    ;
>>>    ; Features of a default Ethernet Rx adapter.
>>> diff --git a/doc/guides/eventdevs/features/dlb2.ini
>>> b/doc/guides/eventdevs/features/dlb2.ini
>>> index 7b80286927..c7193b47c1 100644
>>> --- a/doc/guides/eventdevs/features/dlb2.ini
>>> +++ b/doc/guides/eventdevs/features/dlb2.ini
>>> @@ -15,6 +15,7 @@ implicit_release_disable   = Y
>>>    runtime_port_link          = Y
>>>    multiple_queue_port        = Y
>>>    maintenance_free           = Y
>>> +independent_enq            = Y
>>>
>>>    [Eth Rx adapter Features]
>>>
>>> diff --git a/doc/guides/rel_notes/release_24_11.rst
>>> b/doc/guides/rel_notes/release_24_11.rst
>>> index f0ec07c263..04f389876a 100644
>>> --- a/doc/guides/rel_notes/release_24_11.rst
>>> +++ b/doc/guides/rel_notes/release_24_11.rst
>>> @@ -30,6 +30,11 @@ New Features
>>>      ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
>> capability
>>>      ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
>>>
>>> +* **Updated Event Device Library for independent enqueue feature**
>>> +
>>> +  * Added support for independent enqueue feature. Updated Event Device
>> and
>>> +    PMD feature list.
>>> +
>>>
>>>    Removed Items
>>>    -------------
>>> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
>>> index 08e5f9320b..48e6eadda9 100644
>>> --- a/lib/eventdev/rte_eventdev.h
>>> +++ b/lib/eventdev/rte_eventdev.h
>>> @@ -446,6 +446,31 @@ struct rte_event;
>>>     * @see RTE_SCHED_TYPE_PARALLEL
>>>     */
>>>
>>> +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16) /**< Event
>>> +device is capable of independent enqueue.
>>> + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate
>>> +that Eventdev
>>> + * supports the enqueue in any order or specifically in a different
>>> +order than the
>>> + * dequeue. Eventdev PMD can either transmit events in the changed
>>> +order in which
>>> + * they are enqueued or restore the original order before sending
>>> +them to the
>>> + * underlying hardware device. A flag is provided during the port
>>> +configuration to
>>> + * inform Eventdev PMD that the application intends to use an
>>> +independent enqueue
>>> + * order on a particular port. Note that this capability only matters
>>> +for Eventdevs
>>> + * supporting burst mode.
>>> + *
>>> + * To Inform PMD that the application plans to use independent
>>> +enqueue order on a port
>>> + * this code example can be used:
>>> + *
>>> + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
>>> + *     port_config = port_config |
>> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
>>> + *
>>> + * When an implicit release is enabled on a port, Eventdev PMD will
>>> +also handle
>>> + * the insertion of RELEASE events in place of dropped events. The
>>> +independent enqueue
>>> + * feature only applies to FORWARD and RELEASE events. New events
>>> +(op=RTE_EVENT_OP_NEW)
>>> + * will be transmitted in the order the application enqueues them and
>>> +do not maintain
>>> + * any order relative to FORWARD/RELEASE events. FORWARD vs NEW
>>> +relaxed ordering
>>> + * only applies to ports that have enabled independent enqueue feature.
>>> + */
>>> +
>>>    /* Event device priority levels */
>>>    #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
>>>    /**< Highest priority level for events and queues.
>>> @@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t
>> queue_id, uint32_t attr_id,
>>>     *
>>>     *  @see rte_event_port_setup()
>>>     */
>>> + #define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
>>> +/**< Flag to enable independent enqueue. Must not be set if the
>>> +device
>>> + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
>>> + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
>>> + * RTE_EVENT_OP_RELEASE in an order different than the order the
>>> + * events were dequeued from the event device, while maintaining
>>> + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
>>> + *
>>> + * Note that this flag only matters for Eventdevs supporting burst mode.
>>> + *
>>> + *  @see rte_event_port_setup()
>>> + */
>>>
>>>    /** Event port configuration structure */
>>>    struct rte_event_port_conf {
>>
>> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-28 16:45                   ` Mattias Rönnblom
@ 2024-08-28 16:59                     ` Sevincer, Abdullah
  2024-08-29 12:51                       ` Jerin Jacob
  0 siblings, 1 reply; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-08-28 16:59 UTC (permalink / raw)
  To: Mattias Rönnblom, Pathak, Pravin, dev
  Cc: jerinj, Richardson, Bruce, mattias.ronnblom, Aggarwal, Manish

Thanks Mattias,

Hi Jerin,

Are you okay with the changes so far? 

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-28 16:59                     ` Sevincer, Abdullah
@ 2024-08-29 12:51                       ` Jerin Jacob
  2024-08-31 18:38                         ` Sevincer, Abdullah
  0 siblings, 1 reply; 99+ messages in thread
From: Jerin Jacob @ 2024-08-29 12:51 UTC (permalink / raw)
  To: Sevincer, Abdullah
  Cc: Mattias Rönnblom, Pathak, Pravin, dev, jerinj, Richardson,
	Bruce, mattias.ronnblom, Aggarwal, Manish

On Wed, Aug 28, 2024 at 10:29 PM Sevincer, Abdullah
<abdullah.sevincer@intel.com> wrote:
>
> Thanks Mattias,
>
> Hi Jerin,
>
> Are you okay with the changes so far?

The overall outlook is OK. Send next version with release note changes
from 24.07 to 24.11.

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v9 0/3] Independent Enqueue Support
  2024-08-12 20:00             ` [PATCH v8 2/3] eventdev: " Abdullah Sevincer
  2024-08-23 11:02               ` Mattias Rönnblom
@ 2024-08-29 17:36               ` Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                                   ` (2 more replies)
  2024-08-30 16:23               ` [PATCH v10 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-29 17:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  34 +-
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 463 insertions(+), 216 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v9 1/3] event/dlb2: add support for independent enqueue
  2024-08-29 17:36               ` [PATCH v9 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-29 17:36                 ` Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 2/3] eventdev: " Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-29 17:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |  33 +-
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 417 insertions(+), 219 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..8b973cf81e 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..f0ec07c263 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -24,36 +24,11 @@ DPDK Release 24.11
 New Features
 ------------
 
-.. This section should contain new features added in this release.
-   Sample format:
+* **Updated DLB2 Driver for independent enqueue feature**
 
-   * **Add a title in the past tense with a full stop.**
-
-     Add a short 1-2 sentence description in the past tense.
-     The description should be enough to allow someone scanning
-     the release notes to understand the new feature.
-
-     If the feature adds a lot of sub-features you can use a bullet list
-     like this:
-
-     * Added feature foo to do something.
-     * Enhanced feature bar to do something else.
-
-     Refer to the previous release notes for examples.
-
-     Suggested order in release notes items:
-     * Core libs (EAL, mempool, ring, mbuf, buses)
-     * Device abstraction libs and PMDs (ordered alphabetically by vendor name)
-       - ethdev (lib, PMDs)
-       - cryptodev (lib, PMDs)
-       - eventdev (lib, PMDs)
-       - etc
-     * Other libs
-     * Apps, Examples, Tools (if significant)
-
-     This section is a comment. Do not overwrite or remove it.
-     Also, make sure to start the actual text at the margin.
-     =======================================================
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
 
 Removed Items
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v9 2/3] eventdev: add support for independent enqueue
  2024-08-29 17:36               ` [PATCH v9 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-08-29 17:36                 ` Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-29 17:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index f0ec07c263..04f389876a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -30,6 +30,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v9 3/3] event/dsw: add capability for independent enqueue
  2024-08-29 17:36               ` [PATCH v9 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-08-29 17:36                 ` [PATCH v9 2/3] eventdev: " Abdullah Sevincer
@ 2024-08-29 17:36                 ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-29 17:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 04f389876a..b8d1f36e54 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -35,6 +35,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v10 0/3] Independent Enqueue Support
  2024-08-12 20:00             ` [PATCH v8 2/3] eventdev: " Abdullah Sevincer
  2024-08-23 11:02               ` Mattias Rönnblom
  2024-08-29 17:36               ` [PATCH v9 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-30 16:23               ` Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
                                   ` (2 more replies)
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-30 16:23 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  event/dlb2: add support for independent enqueue
  eventdev: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  34 +-
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 463 insertions(+), 216 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v10 1/3] event/dlb2: add support for independent enqueue
  2024-08-30 16:23               ` [PATCH v10 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-08-30 16:23                 ` Abdullah Sevincer
  2024-09-09  1:47                   ` fengchengwen
                                     ` (2 more replies)
  2024-08-30 16:23                 ` [PATCH v10 2/3] eventdev: add support " Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-30 16:23 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |  33 +-
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 417 insertions(+), 219 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..8b973cf81e 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..f0ec07c263 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -24,36 +24,11 @@ DPDK Release 24.11
 New Features
 ------------
 
-.. This section should contain new features added in this release.
-   Sample format:
+* **Updated DLB2 Driver for independent enqueue feature**
 
-   * **Add a title in the past tense with a full stop.**
-
-     Add a short 1-2 sentence description in the past tense.
-     The description should be enough to allow someone scanning
-     the release notes to understand the new feature.
-
-     If the feature adds a lot of sub-features you can use a bullet list
-     like this:
-
-     * Added feature foo to do something.
-     * Enhanced feature bar to do something else.
-
-     Refer to the previous release notes for examples.
-
-     Suggested order in release notes items:
-     * Core libs (EAL, mempool, ring, mbuf, buses)
-     * Device abstraction libs and PMDs (ordered alphabetically by vendor name)
-       - ethdev (lib, PMDs)
-       - cryptodev (lib, PMDs)
-       - eventdev (lib, PMDs)
-       - etc
-     * Other libs
-     * Apps, Examples, Tools (if significant)
-
-     This section is a comment. Do not overwrite or remove it.
-     Also, make sure to start the actual text at the margin.
-     =======================================================
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
 
 Removed Items
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v10 2/3] eventdev: add support for independent enqueue
  2024-08-30 16:23               ` [PATCH v10 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-08-30 16:23                 ` Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-30 16:23 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index f0ec07c263..04f389876a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -30,6 +30,11 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v10 3/3] event/dsw: add capability for independent enqueue
  2024-08-30 16:23               ` [PATCH v10 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-08-30 16:23                 ` [PATCH v10 2/3] eventdev: add support " Abdullah Sevincer
@ 2024-08-30 16:23                 ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-08-30 16:23 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 04f389876a..b8d1f36e54 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -35,6 +35,10 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v8 2/3] eventdev: add support for independent enqueue
  2024-08-29 12:51                       ` Jerin Jacob
@ 2024-08-31 18:38                         ` Sevincer, Abdullah
  0 siblings, 0 replies; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-08-31 18:38 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Mattias Rönnblom, Pathak, Pravin, dev, jerinj, Richardson,
	Bruce, mattias.ronnblom, Aggarwal, Manish


>+The overall outlook is OK. Send next version with release note changes from 24.07 to 24.11.
Thanks Jerin, I sent the new version patch v10.

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v10 1/3] event/dlb2: add support for independent enqueue
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
@ 2024-09-09  1:47                   ` fengchengwen
  2024-09-16 17:51                     ` Sevincer, Abdullah
  2024-09-09 15:52                   ` [PATCH v11 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-09-09 16:05                   ` [PATCH v12 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 1 reply; 99+ messages in thread
From: fengchengwen @ 2024-09-09  1:47 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

This commit should after "[PATCH v10 2/3] eventdev: add support for independent enqueue"
because this commit use the macro which defined in later commit. Suggest order:
1. lib's commit
2. driver's commits

On 2024/8/31 0:23, Abdullah Sevincer wrote:
> DLB devices need events to be enqueued in the same order they are
> dequeued. Applications are not suppose to change event order between
> dequeue and to enqueue. Since Eventdev standard does not add such
> restrictions independent enqueue support is needed for DLB PMD so that
> it restores dequeue order on enqueue if applications happen to change
> it. It also adds missing releases in places where events are dropped
> by the application and it expects implicit release to handle it.
> 
> By default the feature will be off on all DLB ports and they will
> behave the same as older releases. To enable reordering feature,
> applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
> to port configuration if only the device advertises the capability
> RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---

...

>  New Features
>  ------------
>  
> -.. This section should contain new features added in this release.
> -   Sample format:
> +* **Updated DLB2 Driver for independent enqueue feature**
>  
> -   * **Add a title in the past tense with a full stop.**
> -
> -     Add a short 1-2 sentence description in the past tense.
> -     The description should be enough to allow someone scanning
> -     the release notes to understand the new feature.
> -
> -     If the feature adds a lot of sub-features you can use a bullet list
> -     like this:
> -
> -     * Added feature foo to do something.
> -     * Enhanced feature bar to do something else.
> -
> -     Refer to the previous release notes for examples.
> -
> -     Suggested order in release notes items:
> -     * Core libs (EAL, mempool, ring, mbuf, buses)
> -     * Device abstraction libs and PMDs (ordered alphabetically by vendor name)
> -       - ethdev (lib, PMDs)
> -       - cryptodev (lib, PMDs)
> -       - eventdev (lib, PMDs)
> -       - etc
> -     * Other libs
> -     * Apps, Examples, Tools (if significant)
> -
> -     This section is a comment. Do not overwrite or remove it.
> -     Also, make sure to start the actual text at the margin.
> -     =======================================================

The above line will remove when DPDK 24.11 released, please don't remove it when developing.

...

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v11 0/3] Independent Enqueue Support
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-09-09  1:47                   ` fengchengwen
@ 2024-09-09 15:52                   ` Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                       ` (2 more replies)
  2024-09-09 16:05                   ` [PATCH v12 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 15:52 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation
v5: Address build issues
v4: Address comments
v3: Fix CI/build issues
v2: Fix CI/build issues
v1: Initial patchset

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  15 +
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 469 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v11 1/3] eventdev: add support for independent enqueue
  2024-09-09 15:52                   ` [PATCH v11 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-09-09 15:52                     ` Abdullah Sevincer
  2024-09-19 10:32                       ` Jerin Jacob
  2024-09-09 15:52                     ` [PATCH v11 2/3] event/dlb2: " Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 1 reply; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 15:52 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..89c6a67e6a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -55,6 +55,11 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v11 2/3] event/dlb2: add support for independent enqueue
  2024-09-09 15:52                   ` [PATCH v11 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-09-09 15:52                     ` Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 15:52 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |   6 +
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 419 insertions(+), 190 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..8b973cf81e 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 89c6a67e6a..dff77596f0 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -60,6 +60,12 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DLB2 Driver for independent enqueue feature**
+ 
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v11 3/3] event/dsw: add capability for independent enqueue
  2024-09-09 15:52                   ` [PATCH v11 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-09-09 15:52                     ` [PATCH v11 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-09-09 15:52                     ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 15:52 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index dff77596f0..69338b33b9 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -66,6 +66,10 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v12 0/3] Independent Enqueue Support
  2024-08-30 16:23                 ` [PATCH v10 1/3] event/dlb2: add support for independent enqueue Abdullah Sevincer
  2024-09-09  1:47                   ` fengchengwen
  2024-09-09 15:52                   ` [PATCH v11 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-09-09 16:05                   ` Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                       ` (2 more replies)
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 16:05 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v12: Address comments.
v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation.
v5: Address build issues.
v4: Address comments.
v3: Fix CI/build issues.
v2: Fix CI/build issues.
v1: Initial patchset.

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/dlb2.rst             |  41 ++
 doc/guides/eventdevs/features/default.ini |   1 +
 doc/guides/eventdevs/features/dlb2.ini    |   1 +
 doc/guides/rel_notes/release_24_11.rst    |  15 +
 drivers/event/dlb2/dlb2.c                 | 492 ++++++++++++++--------
 drivers/event/dlb2/dlb2_avx512.c          |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h      |   8 +
 drivers/event/dlb2/dlb2_priv.h            |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h         |  10 +
 drivers/event/dsw/dsw_evdev.c             |   3 +-
 lib/eventdev/rte_eventdev.h               |  37 ++
 11 files changed, 469 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v12 1/3] eventdev: add support for independent enqueue
  2024-09-09 16:05                   ` [PATCH v12 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-09-09 16:05                     ` Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 2/3] event/dlb2: " Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 16:05 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 0ff70d9057..89c6a67e6a 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -55,6 +55,11 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v12 2/3] event/dlb2: add support for independent enqueue
  2024-09-09 16:05                   ` [PATCH v12 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-09-09 16:05                     ` Abdullah Sevincer
  2024-09-19 10:49                       ` [EXTERNAL] " Jerin Jacob
                                         ` (2 more replies)
  2024-09-09 16:05                     ` [PATCH v12 " Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 16:05 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/dlb2.rst          |  41 +++
 doc/guides/rel_notes/release_24_11.rst |   6 +
 drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
 drivers/event/dlb2/dlb2_avx512.c       |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
 drivers/event/dlb2/dlb2_priv.h         |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
 7 files changed, 419 insertions(+), 190 deletions(-)

diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
index 2532d92888..8b973cf81e 100644
--- a/doc/guides/eventdevs/dlb2.rst
+++ b/doc/guides/eventdevs/dlb2.rst
@@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
 
        --allow ea:00.0,enable_cq_weight=<y/Y>
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DLB2 hardware device expects all forwarded events to be enqueued in the same
+order as they are dequeued. For dropped events, their releases should come at
+the same location as the original event was expected. Hardware has this
+restriction as it uses the order to retrieve information about the original
+event that was sent to the CPU.  This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events. Following MACROs are provided to get
+and set this field inside the event in case the same event is not used for forwarding
+(e.g., a new RELEASE event is created when the original event is dropped instead of
+reusing the same event).
+
+    .. code-block:: c
+
+       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
+       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque = val)
+
 Running Eventdev Applications with DLB Device
 ---------------------------------------------
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 89c6a67e6a..61796f24e6 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -60,6 +60,12 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+* **Updated DLB2 Driver for independent enqueue feature**
+
+  Added support for DLB independent enqueue feature. Applications should use
+  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
+  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 0b91f03956..c3e929c917 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -52,6 +52,7 @@
 #if (RTE_EVENT_MAX_QUEUES_PER_DEV > UINT8_MAX)
 #error "RTE_EVENT_MAX_QUEUES_PER_DEV cannot fit in member max_event_queues"
 #endif
+
 static struct rte_event_dev_info evdev_dlb2_default_info = {
 	.driver_name = "", /* probe will set */
 	.min_dequeue_timeout_ns = DLB2_MIN_DEQUEUE_TIMEOUT_NS,
@@ -82,6 +83,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +100,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +311,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2\n",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2\n",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1452,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory\n");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d\n", ret);
@@ -1541,13 +1559,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d\n",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1631,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1955,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d\n",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2003,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,12 +2811,36 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
 {
 	dlb2_movdir64b(port_data->pp_addr, qe4);
 }
 
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
+}
+
 static inline int
 dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 {
@@ -2815,7 +2858,8 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
+
 
 	DLB2_LOG_DBG("dlb2: consume immediate - %d QEs\n", num);
 
@@ -2835,7 +2879,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3030,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3206,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3586,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3622,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3630,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3809,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3828,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3883,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3914,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3929,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4276,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4284,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4327,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4335,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4388,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4415,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5089,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..7daebfa583 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,16 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macros to get/set QID depth and QE weight from rte_event metadata.
+ * Currently 'rsvd' field is used for these. Lower 2 bits are used to store
+ * QID depth while the upper 2 bits are used for QER weight.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v12 3/3] event/dsw: add capability for independent enqueue
  2024-09-09 16:05                   ` [PATCH v12 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-09-09 16:05                     ` [PATCH v12 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-09-09 16:05                     ` Abdullah Sevincer
  2024-09-19 10:49                       ` Jerin Jacob
  2 siblings, 1 reply; 99+ messages in thread
From: Abdullah Sevincer @ 2024-09-09 16:05 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 4 ++++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 61796f24e6..c505215ae1 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -66,6 +66,10 @@ New Features
   ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
   ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+* **Updated DSW Driver for independent enqueue feature**
+
+  * Added capability flag for DSW to advertise independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 0dea1091e3..5c483d869c 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [PATCH v10 1/3] event/dlb2: add support for independent enqueue
  2024-09-09  1:47                   ` fengchengwen
@ 2024-09-16 17:51                     ` Sevincer, Abdullah
  0 siblings, 0 replies; 99+ messages in thread
From: Sevincer, Abdullah @ 2024-09-16 17:51 UTC (permalink / raw)
  To: fengchengwen, dev
  Cc: jerinj, Richardson, Bruce, Pathak, Pravin, mattias.ronnblom,
	Aggarwal, Manish


>+This commit should after "[PATCH v10 2/3] eventdev: add support for independent enqueue"
>+because this commit use the macro which defined in later commit. Suggest order:
>+1. lib's commit
>+2. driver's commits
Thanks this is addressed with v12

>+The above line will remove when DPDK 24.11 released, please don't remove it when developing.

Thanks this is addressed with v12

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v11 1/3] eventdev: add support for independent enqueue
  2024-09-09 15:52                     ` [PATCH v11 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-09-19 10:32                       ` Jerin Jacob
  0 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-09-19 10:32 UTC (permalink / raw)
  To: Abdullah Sevincer
  Cc: dev, jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On Tue, Sep 10, 2024 at 9:08 AM Abdullah Sevincer
<abdullah.sevincer@intel.com> wrote:
>
> This commit adds support for independent enqueue feature
> and updates Event Device and PMD feature list.
>
> A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
> allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
> events on an event port where this capability is enabled.
>
> To use this capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
>
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>


Good to merge next version. Some minor comments.


> ---
>  doc/guides/eventdevs/features/default.ini |  1 +
>  doc/guides/eventdevs/features/dlb2.ini    |  1 +
>  doc/guides/rel_notes/release_24_11.rst    |  5 +++
>  lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
>  4 files changed, 44 insertions(+)
>
> diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
> index 1cc4303fe5..7c4ee99238 100644
> --- a/doc/guides/eventdevs/features/default.ini
> +++ b/doc/guides/eventdevs/features/default.ini
> @@ -22,6 +22,7 @@ carry_flow_id              =
>  maintenance_free           =
>  runtime_queue_attr         =
>  profile_links              =
> +independent_enq            =
>
>  ;
>  ; Features of a default Ethernet Rx adapter.
> diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
> index 7b80286927..c7193b47c1 100644
> --- a/doc/guides/eventdevs/features/dlb2.ini
> +++ b/doc/guides/eventdevs/features/dlb2.ini
> @@ -15,6 +15,7 @@ implicit_release_disable   = Y
>  runtime_port_link          = Y
>  multiple_queue_port        = Y
>  maintenance_free           = Y
> +independent_enq            = Y
>
>  [Eth Rx adapter Features]
>
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index 0ff70d9057..89c6a67e6a 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -55,6 +55,11 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>
> +* **Updated Event Device Library for independent enqueue feature**

Updated eventdev to support independent enqueue feature
i.e Change Event Device Library to eventdev

> +
> +  * Added support for independent enqueue feature. Updated Event Device and

i.e Change Event Device Library to eventdev

> +    PMD feature list.
> +
>
>  Removed Items
>  -------------
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..3e3142d4a6 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -446,6 +446,31 @@ struct rte_event;
>   * @see RTE_SCHED_TYPE_PARALLEL
>   */
>
> +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
> +/**< Event device is capable of independent enqueue.
> + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev

Eventdev->eventdev


> + * supports the enqueue in any order or specifically in a different order than the
> + * dequeue. Eventdev PMD can either transmit events in the changed order in which

transmit->enqueue

> + * they are enqueued or restore the original order before sending them to the
> + * underlying hardware device. A flag is provided during the port configuration to
> + * inform Eventdev PMD that the application intends to use an independent enqueue
> + * order on a particular port. Note that this capability only matters for Eventdevs

Eventdevs->eventdevs

> + * supporting burst mode.
> + *
> + * To Inform PMD that the application plans to use independent enqueue order on a port

Inform->inform

> + * this code example can be used:
> + *
> + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> + *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> + *
> + * When an implicit release is enabled on a port, Eventdev PMD will also handle
> + * the insertion of RELEASE events in place of dropped events. The independent enqueue
> + * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
> + * will be transmitted in the order the application enqueues them and do not maintain

transmitted->enqueued


> + * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
> + * only applies to ports that have enabled independent enqueue feature.
> + */
> +
>  /* Event device priority levels */
>  #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
>  /**< Highest priority level for events and queues.
> @@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>   *
>   *  @see rte_event_port_setup()
>   */
> +#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> +/**< Flag to enable independent enqueue. Must not be set if the device
> + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> + * RTE_EVENT_OP_RELEASE in an order different than the order the
> + * events were dequeued from the event device, while maintaining
> + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> + *
> + * Note that this flag only matters for Eventdevs supporting burst mode.

Eventdevs->eventdevs

> + *
> + *  @see rte_event_port_setup()
> + */
>
>  /** Event port configuration structure */
>  struct rte_event_port_conf {
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* RE: [EXTERNAL] [PATCH v12 2/3] event/dlb2: add support for independent enqueue
  2024-09-09 16:05                     ` [PATCH v12 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-09-19 10:49                       ` Jerin Jacob
  2024-10-03 20:36                       ` [PATCH v13 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-03 20:49                       ` [PATCH v14 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-09-19 10:49 UTC (permalink / raw)
  To: Abdullah Sevincer, dev
  Cc: bruce.richardson, pravin.pathak, mattias.ronnblom, manish.aggarwal

> -----Original Message-----
> From: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Sent: Monday, September 9, 2024 9:35 PM
> To: dev@dpdk.org
> Cc: Jerin Jacob <jerinj@marvell.com>; bruce.richardson@intel.com;
> pravin.pathak@intel.com; mattias.ronnblom@ericsson.com;
> manish.aggarwal@intel.com; Abdullah Sevincer <abdullah.sevincer@intel.com>
> Subject: [EXTERNAL] [PATCH v12 2/3] event/dlb2: add support for independent
> enqueue
> 
> DLB devices need events to be enqueued in the same order they are dequeued.
> Applications are not suppose to change event order between dequeue and to
> enqueue. Since Eventdev standard does not add such restrictions independent
> enqueue support 
> DLB devices need events to be enqueued in the same order they are dequeued.
> Applications are not suppose to change event order between dequeue and to
> enqueue. Since Eventdev standard does not add such restrictions independent
> enqueue support is needed for DLB PMD so that it restores dequeue order on
> enqueue if applications happen to change it. It also adds missing releases in
> places where events are dropped by the application and it expects implicit
> release to handle it.
> 
> By default the feature will be off on all DLB ports and they will behave the same
> as older releases. To enable reordering feature, applications need to add the
> flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
> to port configuration if only the device advertises the capability
> RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.
> 
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  doc/guides/eventdevs/dlb2.rst          |  41 +++
>  doc/guides/rel_notes/release_24_11.rst |   6 +
>  drivers/event/dlb2/dlb2.c              | 492 ++++++++++++++++---------
>  drivers/event/dlb2/dlb2_avx512.c       |  27 +-
>  drivers/event/dlb2/dlb2_inline_fns.h   |   8 +
>  drivers/event/dlb2/dlb2_priv.h         |  25 +-
>  drivers/event/dlb2/rte_pmd_dlb2.h      |  10 +
>  7 files changed, 419 insertions(+), 190 deletions(-)
> 
> diff --git a/doc/guides/eventdevs/dlb2.rst b/doc/guides/eventdevs/dlb2.rst
> index 2532d92888..8b973cf81e 100644
> --- a/doc/guides/eventdevs/dlb2.rst
> +++ b/doc/guides/eventdevs/dlb2.rst
> @@ -456,6 +456,47 @@ Example command to enable QE Weight feature:
> 
>         --allow ea:00.0,enable_cq_weight=<y/Y>
> 
> +Independent Enqueue Capability
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> +DLB2 hardware device expects all forwarded events to be enqueued in the
> +same order as they are dequeued. For dropped events, their releases
> +should come at the same location as the original event was expected.
> +Hardware has this restriction as it uses the order to retrieve
> +information about the original event that was sent to the CPU.  This
> +contains information like atomic flow ID to release the flow lock and
> +ordered events sequence number to restore the original order.
> +
> +Some applications, like those based on the DPDK dispatcher library,
> +want enqueue order independence. To support this, DLB2 PMD supports the
> +``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
> +
> +This capability applies to Eventdevs supporting burst mode. On ports
> +where the application is going to change enqueue order,
> +``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
> +
> +Example code to inform PMD that the application plans to use
> +independent enqueue order on a port:
> +
> +    .. code-block:: c
> +
> +       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> +         port_config = port_config |
> + RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;


A lot of thing in this is generic, update and move that to  doc/guides/prog_guide/eventdev/eventdev.rst
And add DLB2 specific items here.


> +
> +This code example enables enqueue event reordering inside DLB2 PMD
> +before the events are sent to the DLB2 hardware. If the application is
> +not going to change the enqueue order, this flag should not be enabled
> +to get better performance. DLB2 PMD saves ordering information inside
> +the impl_opaque field of the event, and this field should be preserved
> +for all FORWARD or RELEASE events. Following MACROs are provided to get
> +and set this field inside the event in case the same event is not used
> +for forwarding (e.g., a new RELEASE event is created when the original event is
> dropped instead of reusing the same event).
> +
> +    .. code-block:: c
> +
> +       #define RTE_EVENT_GET_IMPL_OPAQUE(ev)      (ev->impl_opaque)
> +       #define RTE_EVENT_SET_IMPL_OPAQUE(ev, val)  (ev->impl_opaque =
> + val)

Don’t add public RTE_EVENT_ symbol as example macro, use some other prefix like DLB2.


> +
>  Running Eventdev Applications with DLB Device
>  ---------------------------------------------
> 
> diff --git a/doc/guides/rel_notes/release_24_11.rst
> b/doc/guides/rel_notes/release_24_11.rst
> index 89c6a67e6a..61796f24e6 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -60,6 +60,12 @@ New Features
>    * Added support for independent enqueue feature. Updated Event Device and
>      PMD feature list.
> 
> +* **Updated DLB2 Driver for independent enqueue feature**

Driver->driver

> +
> +  Added support for DLB independent enqueue feature. Applications
> + should use  ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the
> + feature if the capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.

Next patch add one more item to release notes. Instead, IMO, it better to update the generic section.
for this PMD support. I.e. add one item under 
"*Updated eventdev to support independent enqueue feature" section. As add DLB2 support.


> 
>  #include <rte_compat.h>
> 
> +/**
> + * Macros to get/set QID depth and QE weight from rte_event metadata.
> + * Currently 'rsvd' field is used for these. Lower 2 bits are used to
> +store
> + * QID depth while the upper 2 bits are used for QER weight.
> + */
> +#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3) #define
> +RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v &
> +0x3)) #define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
> +#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3)
> +| ((v & 0x3) << 2))


All of the above MACROs are public symbols, Add Doxygen comment for each of them.

> +
>  /**
>   * @warning
>   * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v12 3/3] event/dsw: add capability for independent enqueue
  2024-09-09 16:05                     ` [PATCH v12 " Abdullah Sevincer
@ 2024-09-19 10:49                       ` Jerin Jacob
  0 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-09-19 10:49 UTC (permalink / raw)
  To: Abdullah Sevincer
  Cc: dev, jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On Mon, Sep 9, 2024 at 9:43 PM Abdullah Sevincer
<abdullah.sevincer@intel.com> wrote:
>
> To use independent enqueue capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
> commit adds the capability of independent enqueue to the DSW driver.
>
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  doc/guides/rel_notes/release_24_11.rst | 4 ++++
>  drivers/event/dsw/dsw_evdev.c          | 3 ++-
>  2 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index 61796f24e6..c505215ae1 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -66,6 +66,10 @@ New Features
>    ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the capability
>    ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
>
> +* **Updated DSW Driver for independent enqueue feature**
> +
> +  * Added capability flag for DSW to advertise independent enqueue feature.

Same documenation comment as 2/3 patch.

> +
>
>  Removed Items
>  -------------
> diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
> index 0dea1091e3..5c483d869c 100644
> --- a/drivers/event/dsw/dsw_evdev.c
> +++ b/drivers/event/dsw/dsw_evdev.c
> @@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
>                 RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
>                 RTE_EVENT_DEV_CAP_NONSEQ_MODE|
>                 RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
> -               RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
> +               RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
> +               RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
>         };
>  }
>
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v13 0/3] Independent Enqueue Support
  2024-09-09 16:05                     ` [PATCH v12 2/3] event/dlb2: " Abdullah Sevincer
  2024-09-19 10:49                       ` [EXTERNAL] " Jerin Jacob
@ 2024-10-03 20:36                       ` Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                           ` (2 more replies)
  2024-10-03 20:49                       ` [PATCH v14 0/3] Independent Enqueue Support Abdullah Sevincer
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v13: Address comments.
v12: Address comments.
v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation.
v5: Address build issues.
v4: Address comments.
v3: Fix CI/build issues.
v2: Fix CI/build issues.
v1: Initial patchset.

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini   |   1 +
 doc/guides/eventdevs/features/dlb2.ini      |   1 +
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |  11 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 drivers/event/dsw/dsw_evdev.c               |   3 +-
 lib/eventdev/rte_eventdev.h                 |  37 ++
 11 files changed, 469 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v13 1/3] eventdev: add support for independent enqueue
  2024-10-03 20:36                       ` [PATCH v13 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-03 20:36                         ` Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 2/3] event/dlb2: " Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index e0a9aa55a1..dee6723b70 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -67,6 +67,11 @@ New Features
 
   The new statistics are useful for debugging and profiling.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v13 2/3] event/dlb2: add support for independent enqueue
  2024-10-03 20:36                       ` [PATCH v13 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-10-03 20:36                         ` Abdullah Sevincer
  2024-10-05  7:02                           ` Jerin Jacob
  2024-10-03 20:36                         ` [PATCH v13 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 1 reply; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |   5 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 7 files changed, 422 insertions(+), 190 deletions(-)

diff --git a/doc/guides/prog_guide/eventdev/eventdev.rst b/doc/guides/prog_guide/eventdev/eventdev.rst
index fb6dfce102..dd22ab69d2 100644
--- a/doc/guides/prog_guide/eventdev/eventdev.rst
+++ b/doc/guides/prog_guide/eventdev/eventdev.rst
@@ -472,6 +472,39 @@ A flush callback can be passed to the function to handle any outstanding events.
 
         Invocation of this API does not affect the existing port configuration.
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some eventdev hardware devices such as DLB2 expects all forwarded events to be 
+enqueued in the same order as they are dequeued. For dropped events, their 
+releases should come at the same location as the original event was expected.
+Hardware has this restriction as it uses the order to retrieve information about
+the original event that was sent to the CPU. This contains information like atomic flow
+ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events.
+
 Stopping the EventDev
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index dee6723b70..98e9732100 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -72,6 +72,11 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+  * Updated DLB2 driver for independent enqueue feature. Applications should
+    use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
+    capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index c43ab864ca..09e4107824 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -82,6 +82,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +99,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +310,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1451,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d", ret);
@@ -1541,13 +1558,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1630,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1954,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2002,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2810,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
+{
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
 {
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
 	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2857,7 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
 
 	DLB2_LOG_LINE_DBG("dlb2: consume immediate - %d QEs", num);
 
@@ -2835,7 +2877,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3028,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3204,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3584,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3620,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3628,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3807,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3826,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3881,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3912,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3927,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4274,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4282,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4325,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4333,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4386,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4413,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5087,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..564b4f18c6 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,30 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macro function to get QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+
+/**
+ * Macro function to set QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+
+/**
+ * Macro function to get QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+
+/**
+ * Macro function to set QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v13 3/3] event/dsw: add capability for independent enqueue
  2024-10-03 20:36                       ` [PATCH v13 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-10-03 20:36                         ` [PATCH v13 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-10-03 20:36                         ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:36 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 1 +
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 98e9732100..4e4ca4fc23 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -76,6 +76,7 @@ New Features
     use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
     capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+  * Updated DSW driver for independent enqueue feature.
 
 
 Removed Items
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 8a1a2db8ac..9fb187bc74 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v14 0/3] Independent Enqueue Support
  2024-09-09 16:05                     ` [PATCH v12 2/3] event/dlb2: " Abdullah Sevincer
  2024-09-19 10:49                       ` [EXTERNAL] " Jerin Jacob
  2024-10-03 20:36                       ` [PATCH v13 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-03 20:49                       ` Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                           ` (2 more replies)
  2 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:49 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v14: Address comments.
v13: Address comments.
v12: Address comments.
v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation.
v5: Address build issues.
v4: Address comments.
v3: Fix CI/build issues.
v2: Fix CI/build issues.
v1: Initial patchset.

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini   |   1 +
 doc/guides/eventdevs/features/dlb2.ini      |   1 +
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |  11 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 drivers/event/dsw/dsw_evdev.c               |   3 +-
 lib/eventdev/rte_eventdev.h                 |  37 ++
 11 files changed, 469 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v14 1/3] eventdev: add support for independent enqueue
  2024-10-03 20:49                       ` [PATCH v14 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-03 20:50                         ` Abdullah Sevincer
  2024-10-05  6:51                           ` Jerin Jacob
                                             ` (4 more replies)
  2024-10-03 20:50                         ` [PATCH v14 2/3] event/dlb2: add support " Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 5 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:50 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

This commit adds support for independent enqueue feature
and updates Event Device and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  5 +++
 lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index e0a9aa55a1..dee6723b70 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -67,6 +67,11 @@ New Features
 
   The new statistics are useful for debugging and profiling.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. Updated Event Device and
+    PMD feature list.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..3e3142d4a6 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,31 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either transmit events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for Eventdevs
+ * supporting burst mode.
+ *
+ * To Inform PMD that the application plans to use independent enqueue order on a port
+ * this code example can be used:
+ *
+ *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+ *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be transmitted in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v14 2/3] event/dlb2: add support for independent enqueue
  2024-10-03 20:49                       ` [PATCH v14 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-10-03 20:50                         ` Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:50 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |   5 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 7 files changed, 422 insertions(+), 190 deletions(-)

diff --git a/doc/guides/prog_guide/eventdev/eventdev.rst b/doc/guides/prog_guide/eventdev/eventdev.rst
index fb6dfce102..801e970021 100644
--- a/doc/guides/prog_guide/eventdev/eventdev.rst
+++ b/doc/guides/prog_guide/eventdev/eventdev.rst
@@ -472,6 +472,39 @@ A flush callback can be passed to the function to handle any outstanding events.
 
         Invocation of this API does not affect the existing port configuration.
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some eventdev hardware devices such as DLB2 expects all forwarded events to be
+enqueued in the same order as they are dequeued. For dropped events, their
+releases should come at the same location as the original event was expected.
+Hardware has this restriction as it uses the order to retrieve information about
+the original event that was sent to the CPU. This contains information like atomic
+flow ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events.
+
 Stopping the EventDev
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index dee6723b70..98e9732100 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -72,6 +72,11 @@ New Features
   * Added support for independent enqueue feature. Updated Event Device and
     PMD feature list.
 
+  * Updated DLB2 driver for independent enqueue feature. Applications should
+    use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
+    capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index c43ab864ca..09e4107824 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -82,6 +82,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +99,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +310,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1451,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d", ret);
@@ -1541,13 +1558,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1630,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1954,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2002,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2810,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
+{
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
 {
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
 	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2857,7 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
 
 	DLB2_LOG_LINE_DBG("dlb2: consume immediate - %d QEs", num);
 
@@ -2835,7 +2877,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3028,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3204,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3584,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3620,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3628,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3807,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3826,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3881,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3912,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3927,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4274,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4282,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4325,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4333,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4386,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4413,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5087,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..564b4f18c6 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,30 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macro function to get QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+
+/**
+ * Macro function to set QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+
+/**
+ * Macro function to get QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+
+/**
+ * Macro function to set QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v14 3/3] event/dsw: add capability for independent enqueue
  2024-10-03 20:49                       ` [PATCH v14 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-10-03 20:50                         ` [PATCH v14 2/3] event/dlb2: add support " Abdullah Sevincer
@ 2024-10-03 20:50                         ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-03 20:50 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 1 +
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 98e9732100..4e4ca4fc23 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -76,6 +76,7 @@ New Features
     use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
     capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+  * Updated DSW driver for independent enqueue feature.
 
 
 Removed Items
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 8a1a2db8ac..9fb187bc74 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v14 1/3] eventdev: add support for independent enqueue
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-10-05  6:51                           ` Jerin Jacob
  2024-10-05 17:51                           ` [PATCH v15 0/3] Independent Enqueue Support Abdullah Sevincer
                                             ` (3 subsequent siblings)
  4 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-10-05  6:51 UTC (permalink / raw)
  To: Abdullah Sevincer
  Cc: dev, jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On Fri, Oct 4, 2024 at 2:20 AM Abdullah Sevincer
<abdullah.sevincer@intel.com> wrote:
>
> This commit adds support for independent enqueue feature

Remove "This commit adds"

> and updates Event Device and PMD feature list.
>
> A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
> allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
> events on an event port where this capability is enabled.
>
> To use this capability applications need to set flag
> RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
> capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.
>
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  doc/guides/eventdevs/features/default.ini |  1 +
>  doc/guides/eventdevs/features/dlb2.ini    |  1 +
>  doc/guides/rel_notes/release_24_11.rst    |  5 +++
>  lib/eventdev/rte_eventdev.h               | 37 +++++++++++++++++++++++
>  4 files changed, 44 insertions(+)
>
> diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
> index 1cc4303fe5..7c4ee99238 100644
> --- a/doc/guides/eventdevs/features/default.ini
> +++ b/doc/guides/eventdevs/features/default.ini
> @@ -22,6 +22,7 @@ carry_flow_id              =
>  maintenance_free           =
>  runtime_queue_attr         =
>  profile_links              =
> +independent_enq            =
>
>  ;
>  ; Features of a default Ethernet Rx adapter.
> diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
> index 7b80286927..c7193b47c1 100644
> --- a/doc/guides/eventdevs/features/dlb2.ini
> +++ b/doc/guides/eventdevs/features/dlb2.ini
> @@ -15,6 +15,7 @@ implicit_release_disable   = Y
>  runtime_port_link          = Y
>  multiple_queue_port        = Y
>  maintenance_free           = Y
> +independent_enq            = Y
>
>  [Eth Rx adapter Features]
>
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index e0a9aa55a1..dee6723b70 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -67,6 +67,11 @@ New Features
>
>    The new statistics are useful for debugging and profiling.
>
> +* **Updated Event Device Library for independent enqueue feature**
> +
> +  * Added support for independent enqueue feature. Updated Event Device and
> +    PMD feature list.

No need to add "Updated Event Device and PMD feature list." Instead,
very short summary of features
can be added here.


> +
>
>  Removed Items
>  -------------
> diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
> index 08e5f9320b..3e3142d4a6 100644
> --- a/lib/eventdev/rte_eventdev.h
> +++ b/lib/eventdev/rte_eventdev.h
> @@ -446,6 +446,31 @@ struct rte_event;
>   * @see RTE_SCHED_TYPE_PARALLEL
>   */
>
> +#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
> +/**< Event device is capable of independent enqueue.
> + * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
> + * supports the enqueue in any order or specifically in a different order than the
> + * dequeue. Eventdev PMD can either transmit events in the changed order in which
> + * they are enqueued or restore the original order before sending them to the
> + * underlying hardware device. A flag is provided during the port configuration to
> + * inform Eventdev PMD that the application intends to use an independent enqueue
> + * order on a particular port. Note that this capability only matters for Eventdevs
> + * supporting burst mode.
> + *

----------- see below---
> + * To Inform PMD that the application plans to use independent enqueue order on a port
> + * this code example can be used:
> + *
> + *  if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> + *     port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;

--------------------------

Above section, please move to programming guide, no need to add code
in Doxygen comments.

> + *
> + * When an implicit release is enabled on a port, Eventdev PMD will also handle
> + * the insertion of RELEASE events in place of dropped events. The independent enqueue
> + * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
> + * will be transmitted in the order the application enqueues them and do not maintain

transmitted -> dequeded?

> + * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
> + * only applies to ports that have enabled independent enqueue feature.
> + */
> +
>  /* Event device priority levels */
>  #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
>  /**< Highest priority level for events and queues.
> @@ -1072,6 +1097,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
>   *
>   *  @see rte_event_port_setup()
>   */
> +#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
> +/**< Flag to enable independent enqueue. Must not be set if the device
> + * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
> + * allows an application to enqueue RTE_EVENT_OP_FORWARD or
> + * RTE_EVENT_OP_RELEASE in an order different than the order the
> + * events were dequeued from the event device, while maintaining
> + * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
> + *
> + * Note that this flag only matters for Eventdevs supporting burst mode.
> + *
> + *  @see rte_event_port_setup()
> + */
>
>  /** Event port configuration structure */
>  struct rte_event_port_conf {
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 99+ messages in thread

* Re: [PATCH v13 2/3] event/dlb2: add support for independent enqueue
  2024-10-03 20:36                         ` [PATCH v13 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-10-05  7:02                           ` Jerin Jacob
  0 siblings, 0 replies; 99+ messages in thread
From: Jerin Jacob @ 2024-10-05  7:02 UTC (permalink / raw)
  To: Abdullah Sevincer
  Cc: dev, jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal

On Fri, Oct 4, 2024 at 6:34 PM Abdullah Sevincer
<abdullah.sevincer@intel.com> wrote:
>
> DLB devices need events to be enqueued in the same order they are
> dequeued. Applications are not suppose to change event order between
> dequeue and to enqueue. Since Eventdev standard does not add such
> restrictions independent enqueue support is needed for DLB PMD so that
> it restores dequeue order on enqueue if applications happen to change
> it. It also adds missing releases in places where events are dropped
> by the application and it expects implicit release to handle it.
>
> By default the feature will be off on all DLB ports and they will
> behave the same as older releases. To enable reordering feature,
> applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
> to port configuration if only the device advertises the capability
> RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.
>
> Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
> Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> ---
>  doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
>  doc/guides/rel_notes/release_24_11.rst      |   5 +
>  drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
>  drivers/event/dlb2/dlb2_avx512.c            |  27 +-
>  drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
>  drivers/event/dlb2/dlb2_priv.h              |  25 +-
>  drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
>  7 files changed, 422 insertions(+), 190 deletions(-)
>
> diff --git a/doc/guides/prog_guide/eventdev/eventdev.rst b/doc/guides/prog_guide/eventdev/eventdev.rst
> index fb6dfce102..dd22ab69d2 100644
> --- a/doc/guides/prog_guide/eventdev/eventdev.rst
> +++ b/doc/guides/prog_guide/eventdev/eventdev.rst
> +Independent Enqueue Capability
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> +Some eventdev hardware devices such as DLB2 expects all forwarded events to be

Remove DLB2 from _generic+ documentation. Instead, express with capability.

> +enqueued in the same order as they are dequeued. For dropped events, their
> +releases should come at the same location as the original event was expected.
> +Hardware has this restriction as it uses the order to retrieve information about
> +the original event that was sent to the CPU. This contains information like atomic flow
> +ID to release the flow lock and ordered events sequence number to restore the
> +original order.
> +
> +Some applications, like those based on the DPDK dispatcher library, want
> +enqueue order independence. To support this, DLB2 PMD supports the

Remove DLB2 from generic documentation.

> +``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
> +
> +This capability applies to Eventdevs supporting burst mode. On ports where
> +the application is going to change enqueue order,
> +``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
> +
> +Example code to inform PMD that the application plans to use independent enqueue
> +order on a port:
> +
> +    .. code-block:: c
> +
> +       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
> +         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
> +

--------------------
AS MENTIONED IN THE LAST comment, Please move this section to 1/3 patch.


> +This code example enables enqueue event reordering inside DLB2 PMD before the events
> +are sent to the DLB2 hardware. If the application is not going to change the enqueue
> +order, this flag should not be enabled to get better performance. DLB2 PMD saves
> +ordering information inside the impl_opaque field of the event, and this field should
> +be preserved for all FORWARD or RELEASE events.

This section is a candidate for DLB2 specific documentation. DONT ADD IN
doc/guides/prog_guide/eventdev/eventdev.rst instead move any DLB2
specific documentation
DLB2 specific document if needed.


> +
>  Stopping the EventDev
>  ~~~~~~~~~~~~~~~~~~~~~
>
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index dee6723b70..98e9732100 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -72,6 +72,11 @@ New Features
>    * Added support for independent enqueue feature. Updated Event Device and
>      PMD feature list.
>
> +  * Updated DLB2 driver for independent enqueue feature. Applications should
> +    use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
> +    capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.

Starting from “Applications ...” is not specific to DLB2, remove from
release notes.

Please send the next version on Monday, planning to submit rc1 PR on Tuesday.

^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v15 0/3] Independent Enqueue Support
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-10-05  6:51                           ` Jerin Jacob
@ 2024-10-05 17:51                           ` Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                               ` (2 more replies)
  2024-10-05 18:20                           ` [PATCH v16 0/3] Independent Enqueue Support Abdullah Sevincer
                                             ` (2 subsequent siblings)
  4 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 17:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v15: Address comments.
v13: Address comments.
v13: Address comments.
v12: Address comments.
v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation.
v5: Address build issues.
v4: Address comments.
v3: Fix CI/build issues.
v2: Fix CI/build issues.
v1: Initial patchset.

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini   |   1 +
 doc/guides/eventdevs/features/dlb2.ini      |   1 +
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |  12 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 drivers/event/dsw/dsw_evdev.c               |   3 +-
 lib/eventdev/rte_eventdev.h                 |  31 ++
 11 files changed, 464 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v15 1/3] eventdev: add support for independent enqueue
  2024-10-05 17:51                           ` [PATCH v15 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-05 17:51                             ` Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 2/3] event/dlb2: " Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 17:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

Support for independent enqueue feature and updates Event Device
and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  6 +++++
 lib/eventdev/rte_eventdev.h               | 31 +++++++++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index e0a9aa55a1..436e2c566f 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -67,6 +67,12 @@ New Features
 
   The new statistics are useful for debugging and profiling.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. With this feature Eventdev 
+    supports enqueue in any order or specifically in a different order than
+    dequeue. The feature is intended for eventdevs supporting burst mode.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..73a44b2ac5 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,25 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either dequeue events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for eventdevs
+ * supporting burst mode.
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be dequeued in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1091,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v15 2/3] event/dlb2: add support for independent enqueue
  2024-10-05 17:51                           ` [PATCH v15 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-10-05 17:51                             ` Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 17:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |   4 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 7 files changed, 421 insertions(+), 190 deletions(-)

diff --git a/doc/guides/prog_guide/eventdev/eventdev.rst b/doc/guides/prog_guide/eventdev/eventdev.rst
index fb6dfce102..801e970021 100644
--- a/doc/guides/prog_guide/eventdev/eventdev.rst
+++ b/doc/guides/prog_guide/eventdev/eventdev.rst
@@ -472,6 +472,39 @@ A flush callback can be passed to the function to handle any outstanding events.
 
         Invocation of this API does not affect the existing port configuration.
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some eventdev hardware devices such as DLB2 expects all forwarded events to be
+enqueued in the same order as they are dequeued. For dropped events, their
+releases should come at the same location as the original event was expected.
+Hardware has this restriction as it uses the order to retrieve information about
+the original event that was sent to the CPU. This contains information like atomic
+flow ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events.
+
 Stopping the EventDev
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index 436e2c566f..d020277bf5 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -73,6 +73,10 @@ New Features
     supports enqueue in any order or specifically in a different order than
     dequeue. The feature is intended for eventdevs supporting burst mode.
 
+  * Updated DLB2 driver for independent enqueue feature. Applications should
+    use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
+    capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index c43ab864ca..09e4107824 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -82,6 +82,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +99,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +310,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1451,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d", ret);
@@ -1541,13 +1558,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1630,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1954,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2002,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2810,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
+{
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
 {
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
 	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2857,7 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
 
 	DLB2_LOG_LINE_DBG("dlb2: consume immediate - %d QEs", num);
 
@@ -2835,7 +2877,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3028,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3204,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3584,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3620,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3628,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3807,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask);
 		const __m128i v_sched_map = _mm_loadu_si128(
 					     (const __m128i *)sched_type_map);
 		__m128i v_sched_mask = _mm_loadu_si128(
@@ -3732,6 +3826,9 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 		__m128i v_preshift = _mm_and_si128(v_sched_remapped,
 						   v_sched_mask);
 		v_sched_done = _mm_srli_epi32(v_preshift, 10);
+		__m128i v_qid_depth =  _mm_and_si128(v_qe_status, v_qid_depth_mask);
+		v_qid_depth = _mm_srli_epi32(v_qid_depth, 15);
+		v_sched_done = _mm_or_si128(v_sched_done, v_qid_depth);
 	}
 
 	/* Priority handling
@@ -3784,9 +3881,10 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 					(const __m128i *)sub_event_mask);
 		__m128i v_flow_mask  = _mm_loadu_si128(
 				       (const __m128i *)flow_mask);
-		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 8);
+		__m128i v_sub = _mm_srli_epi32(v_qe_meta, 4);
 		v_sub = _mm_and_si128(v_sub, v_sub_event_mask);
-		__m128i v_type = _mm_and_si128(v_qe_meta, v_event_mask);
+		__m128i v_type = _mm_srli_epi32(v_qe_meta, 12);
+		v_type = _mm_and_si128(v_type, v_event_mask);
 		v_type = _mm_slli_epi32(v_type, 8);
 		v_types_done = _mm_or_si128(v_type, v_sub);
 		v_types_done = _mm_slli_epi32(v_types_done, 20);
@@ -3814,12 +3912,14 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 4:
 		v_ev_3 = _mm_blend_epi16(v_unpk_ev_23, v_qe_3, 0x0F);
 		v_ev_3 = _mm_alignr_epi8(v_ev_3, v_ev_3, 8);
+		v_ev_3 = _mm_insert_epi8(v_ev_3, qm_port->reorder_id + 3, 7);
 		_mm_storeu_si128((__m128i *)&events[3], v_ev_3);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched3],
 			      1);
 		/* fallthrough */
 	case 3:
 		v_ev_2 = _mm_unpacklo_epi64(v_unpk_ev_23, v_qe_2);
+		v_ev_2 = _mm_insert_epi8(v_ev_2, qm_port->reorder_id + 2, 7);
 		_mm_storeu_si128((__m128i *)&events[2], v_ev_2);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched2],
 			      1);
@@ -3827,16 +3927,19 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 	case 2:
 		v_ev_1 = _mm_blend_epi16(v_unpk_ev_01, v_qe_1, 0x0F);
 		v_ev_1 = _mm_alignr_epi8(v_ev_1, v_ev_1, 8);
+		v_ev_1 = _mm_insert_epi8(v_ev_1, qm_port->reorder_id + 1, 7);
 		_mm_storeu_si128((__m128i *)&events[1], v_ev_1);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched1],
 			      1);
 		/* fallthrough */
 	case 1:
 		v_ev_0 = _mm_unpacklo_epi64(v_unpk_ev_01, v_qe_0);
+		v_ev_0 = _mm_insert_epi8(v_ev_0, qm_port->reorder_id, 7);
 		_mm_storeu_si128((__m128i *)&events[0], v_ev_0);
 		DLB2_INC_STAT(qm_port->ev_port->stats.rx_sched_cnt[hw_sched0],
 			      1);
 	}
+	qm_port->reorder_id += valid_events;
 }
 
 static __rte_always_inline int
@@ -4171,6 +4274,7 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4178,8 +4282,21 @@ dlb2_event_dequeue_burst(void *event_port, struct rte_event *ev, uint16_t num,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
-
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+		if (qm_port->reorder_en) {
+			/* for directed, no-op command-byte = 0, but set dsi field */
+			/* for load-balanced, set COMP */
+			uint64_t release_u64 =
+			    qm_port->is_directed ? 0xFF : (uint64_t)DLB2_COMP_CMD_BYTE << 56;
+
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++)
+				if (order->enq_reorder[i].u64[1] == 0)
+					order->enq_reorder[i].u64[1] = release_u64;
+
+			__dlb2_event_enqueue_burst_reorder(event_port, NULL, 0,
+						   qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
@@ -4208,6 +4325,7 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 	struct dlb2_eventdev_port *ev_port = event_port;
 	struct dlb2_port *qm_port = &ev_port->qm_port;
 	struct dlb2_eventdev *dlb2 = ev_port->dlb2;
+	struct dlb2_reorder *order = qm_port->order;
 	uint16_t cnt;
 
 	RTE_ASSERT(ev_port->setup_done);
@@ -4215,9 +4333,35 @@ dlb2_event_dequeue_burst_sparse(void *event_port, struct rte_event *ev,
 
 	if (ev_port->implicit_release && ev_port->outstanding_releases > 0) {
 		uint16_t out_rels = ev_port->outstanding_releases;
+		if (qm_port->reorder_en) {
+			struct rte_event release_burst[8];
+			int num_releases = 0;
+
+			/* go through reorder buffer looking for missing releases. */
+			for (uint8_t i = order->next_to_enqueue; i != qm_port->reorder_id; i++) {
+				if (order->enq_reorder[i].u64[1] == 0) {
+					release_burst[num_releases++] = (struct rte_event){
+						.op = RTE_EVENT_OP_RELEASE,
+							.impl_opaque = i,
+					};
+
+					if (num_releases == RTE_DIM(release_burst)) {
+						__dlb2_event_enqueue_burst_reorder(event_port,
+							release_burst, RTE_DIM(release_burst),
+							qm_port->token_pop_mode == DELAYED_POP);
+						num_releases = 0;
+					}
+				}
+			}
 
-		dlb2_event_release(dlb2, ev_port->id, out_rels);
+			if (num_releases)
+				__dlb2_event_enqueue_burst_reorder(event_port, release_burst
+					, num_releases, qm_port->token_pop_mode == DELAYED_POP);
+		} else {
+			dlb2_event_release(dlb2, ev_port->id, out_rels);
+		}
 
+		RTE_ASSERT(ev_port->outstanding_releases == 0);
 		DLB2_INC_STAT(ev_port->stats.tx_implicit_rel, out_rels);
 	}
 
@@ -4242,6 +4386,8 @@ static void
 dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 {
 	struct dlb2_eventdev *dlb2 = dlb2_pmd_priv(dev);
+	struct dlb2_eventdev_port *ev_port = &dlb2->ev_ports[port_id];
+	struct dlb2_reorder *order = ev_port->qm_port.order;
 	eventdev_stop_flush_t flush;
 	struct rte_event ev;
 	uint8_t dev_id;
@@ -4267,8 +4413,10 @@ dlb2_flush_port(struct rte_eventdev *dev, int port_id)
 	/* Enqueue any additional outstanding releases */
 	ev.op = RTE_EVENT_OP_RELEASE;
 
-	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--)
+	for (i = dlb2->ev_ports[port_id].outstanding_releases; i > 0; i--) {
+		ev.impl_opaque = order ? order->next_to_enqueue : 0;
 		rte_event_enqueue_burst(dev_id, port_id, &ev, 1);
+	}
 }
 
 static uint32_t
@@ -4939,6 +5087,8 @@ dlb2_parse_params(const char *params,
 				rte_kvargs_free(kvlist);
 				return ret;
 			}
+			if (version == DLB2_HW_V2 && dlb2_args->enable_cq_weight)
+				DLB2_LOG_INFO("Ignoring 'enable_cq_weight=y'. Only supported for 2.5 HW onwards");
 
 			rte_kvargs_free(kvlist);
 		}
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
index 3c8906af9d..4f8c490f8c 100644
--- a/drivers/event/dlb2/dlb2_avx512.c
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -151,20 +151,20 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		 */
 #define DLB2_QE_EV_TYPE_WORD 0
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
+					     ev[0].sub_event_type << 4 |
+						ev[0].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
+					     ev[1].sub_event_type << 4 |
+						ev[1].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
+					     ev[2].sub_event_type << 4 |
+						ev[2].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD);
 		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
+					     ev[3].sub_event_type << 4 |
+						ev[3].event_type << 12,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
 		if (qm_port->use_avx512) {
@@ -238,11 +238,11 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 		}
 
 			/* will only be set for DLB 2.5 + */
-		if (qm_port->cq_weight) {
-			qe[0].weight = ev[0].impl_opaque & 3;
-			qe[1].weight = ev[1].impl_opaque & 3;
-			qe[2].weight = ev[2].impl_opaque & 3;
-			qe[3].weight = ev[3].impl_opaque & 3;
+		if (qm_port->dlb2->enable_cq_weight) {
+			qe[0].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[0]);
+			qe[1].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[1]);
+			qe[2].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[2]);
+			qe[3].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[3]);
 		}
 
 		break;
@@ -267,6 +267,7 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 			}
 			qe[i].u.event_type.major = ev[i].event_type;
 			qe[i].u.event_type.sub = ev[i].sub_event_type;
+			qe[i].weight = RTE_PMD_DLB2_GET_QE_WEIGHT(&ev[i]);
 		}
 		break;
 	case 0:
diff --git a/drivers/event/dlb2/dlb2_inline_fns.h b/drivers/event/dlb2/dlb2_inline_fns.h
index 1429281cfd..61a507d159 100644
--- a/drivers/event/dlb2/dlb2_inline_fns.h
+++ b/drivers/event/dlb2/dlb2_inline_fns.h
@@ -32,4 +32,12 @@ dlb2_movdir64b(void *dest, void *src)
 	: "a" (dest), "d" (src));
 }
 
+static inline void
+dlb2_movdir64b_single(void *pp_addr, void *qe4)
+{
+	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+		:
+	: "a" (pp_addr), "d" (qe4));
+}
+
 #endif /* _DLB2_INLINE_FNS_H_ */
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 2470ae0271..52da31ed31 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -29,7 +29,8 @@
 #define DLB2_SW_CREDIT_C_QUANTA_DEFAULT 256 /* Consumer */
 #define DLB2_DEPTH_THRESH_DEFAULT 256
 #define DLB2_MIN_CQ_DEPTH_OVERRIDE 32
-#define DLB2_MAX_CQ_DEPTH_OVERRIDE 128
+#define DLB2_MAX_CQ_DEPTH_OVERRIDE 1024
+#define DLB2_MAX_CQ_DEPTH_REORDER 128
 #define DLB2_MIN_ENQ_DEPTH_OVERRIDE 32
 #define DLB2_MAX_ENQ_DEPTH_OVERRIDE 1024
 
@@ -387,8 +388,23 @@ struct dlb2_port {
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
 	bool use_avx512;
-	uint32_t cq_weight;
 	bool is_producer; /* True if port is of type producer */
+	uint8_t reorder_id; /* id used for reordering events coming back into the scheduler */
+	bool reorder_en;
+	struct dlb2_reorder *order; /* For ordering enqueues */
+};
+
+struct dlb2_reorder {
+	/* a reorder buffer for events coming back in different order from dequeue
+	 * We use UINT8_MAX + 1 elements, but add on three no-ops to make movdirs easier at the end
+	 */
+	union {
+		__m128i m128;
+		struct dlb2_enqueue_qe qe;
+		uint64_t u64[2];
+	} enq_reorder[UINT8_MAX + 4];
+	/* id of the next entry in the reorder enqueue ring to send in */
+	uint8_t next_to_enqueue;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -642,10 +658,6 @@ struct dlb2_qid_depth_thresholds {
 	int val[DLB2_MAX_NUM_QUEUES_ALL];
 };
 
-struct dlb2_cq_weight {
-	int limit[DLB2_MAX_NUM_PORTS_ALL];
-};
-
 struct dlb2_port_cos {
 	int cos_id[DLB2_MAX_NUM_PORTS_ALL];
 };
@@ -667,7 +679,6 @@ struct dlb2_devargs {
 	bool vector_opts_enabled;
 	int max_cq_depth;
 	int max_enq_depth;
-	struct dlb2_cq_weight cq_weight;
 	struct dlb2_port_cos port_cos;
 	struct dlb2_cos_bw cos_bw;
 	const char *producer_coremask;
diff --git a/drivers/event/dlb2/rte_pmd_dlb2.h b/drivers/event/dlb2/rte_pmd_dlb2.h
index 334c6c356d..564b4f18c6 100644
--- a/drivers/event/dlb2/rte_pmd_dlb2.h
+++ b/drivers/event/dlb2/rte_pmd_dlb2.h
@@ -19,6 +19,30 @@ extern "C" {
 
 #include <rte_compat.h>
 
+/**
+ * Macro function to get QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_GET_QID_DEPTH(x) ((x)->rsvd & 0x3)
+
+/**
+ * Macro function to set QID depth of rte_event metadata.
+ * Currently lower 2 bits of 'rsvd' field are used to store QID depth.
+ */
+#define RTE_PMD_DLB2_SET_QID_DEPTH(x, v) ((x)->rsvd = ((x)->rsvd & ~0x3) | (v & 0x3))
+
+/**
+ * Macro function to get QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_GET_QE_WEIGHT(x) (((x)->rsvd >> 2) & 0x3)
+
+/**
+ * Macro function to set QE weight from rte_event metadata.
+ * Currently upper 2 bits of 'rsvd' field are used to store QE weight.
+ */
+#define RTE_PMD_DLB2_SET_QE_WEIGHT(x, v) ((x)->rsvd = ((x)->rsvd & 0x3) | ((v & 0x3) << 2))
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v15 3/3] event/dsw: add capability for independent enqueue
  2024-10-05 17:51                           ` [PATCH v15 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-10-05 17:51                             ` [PATCH v15 2/3] event/dlb2: " Abdullah Sevincer
@ 2024-10-05 17:51                             ` Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 17:51 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

To use independent enqueue capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists. Hence, this
commit adds the capability of independent enqueue to the DSW driver.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/rel_notes/release_24_11.rst | 2 ++
 drivers/event/dsw/dsw_evdev.c          | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index d020277bf5..09f1ed0b95 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -77,6 +77,8 @@ New Features
     use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
     capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
 
+  * Updated DSW driver for independent enqueue feature.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dsw/dsw_evdev.c b/drivers/event/dsw/dsw_evdev.c
index 8a1a2db8ac..9fb187bc74 100644
--- a/drivers/event/dsw/dsw_evdev.c
+++ b/drivers/event/dsw/dsw_evdev.c
@@ -230,7 +230,8 @@ dsw_info_get(struct rte_eventdev *dev __rte_unused,
 		RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE|
 		RTE_EVENT_DEV_CAP_NONSEQ_MODE|
 		RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT|
-		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID
+		RTE_EVENT_DEV_CAP_CARRY_FLOW_ID |
+		RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ
 	};
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v16 0/3] Independent Enqueue Support
  2024-10-03 20:50                         ` [PATCH v14 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
  2024-10-05  6:51                           ` Jerin Jacob
  2024-10-05 17:51                           ` [PATCH v15 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-05 18:20                           ` Abdullah Sevincer
  2024-10-05 18:20                             ` [PATCH v16 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
                                               ` (2 more replies)
  2024-10-07  2:00                           ` [PATCH v17 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-07  8:13                           ` [PATCH v18 0/3] Independent Enqueue Support Abdullah Sevincer
  4 siblings, 3 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 18:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

v16: Address comments.
v15: Address comments.
v14: Address comments.
v13: Address comments.
v12: Address comments.
v11: Address comments.
v10: Add acked-by reviewer name.
v9: Address comments.
v8: Address build issues.
v7: Address documentation reviews.
v6: Update patch with more documentation.
v5: Address build issues.
v4: Address comments.
v3: Fix CI/build issues.
v2: Fix CI/build issues.
v1: Initial patchset.

Abdullah Sevincer (3):
  eventdev: add support for independent enqueue
  event/dlb2: add support for independent enqueue
  event/dsw: add capability for independent enqueue

 doc/guides/eventdevs/features/default.ini   |   1 +
 doc/guides/eventdevs/features/dlb2.ini      |   1 +
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |  12 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 drivers/event/dsw/dsw_evdev.c               |   3 +-
 lib/eventdev/rte_eventdev.h                 |  31 ++
 11 files changed, 464 insertions(+), 191 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v16 1/3] eventdev: add support for independent enqueue
  2024-10-05 18:20                           ` [PATCH v16 0/3] Independent Enqueue Support Abdullah Sevincer
@ 2024-10-05 18:20                             ` Abdullah Sevincer
  2024-10-05 18:20                             ` [PATCH v16 2/3] event/dlb2: " Abdullah Sevincer
  2024-10-05 18:20                             ` [PATCH v16 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 18:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

Support for independent enqueue feature and updates Event Device
and PMD feature list.

A new capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ is introduced. It
allows out-of-order enqueuing of RTE_EVENT_OP_FORWARD or RELEASE type
events on an event port where this capability is enabled.

To use this capability applications need to set flag
RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ during port setup only if the
capability RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ exists.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/eventdevs/features/default.ini |  1 +
 doc/guides/eventdevs/features/dlb2.ini    |  1 +
 doc/guides/rel_notes/release_24_11.rst    |  6 +++++
 lib/eventdev/rte_eventdev.h               | 31 +++++++++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/doc/guides/eventdevs/features/default.ini b/doc/guides/eventdevs/features/default.ini
index 1cc4303fe5..7c4ee99238 100644
--- a/doc/guides/eventdevs/features/default.ini
+++ b/doc/guides/eventdevs/features/default.ini
@@ -22,6 +22,7 @@ carry_flow_id              =
 maintenance_free           =
 runtime_queue_attr         =
 profile_links              =
+independent_enq            =
 
 ;
 ; Features of a default Ethernet Rx adapter.
diff --git a/doc/guides/eventdevs/features/dlb2.ini b/doc/guides/eventdevs/features/dlb2.ini
index 7b80286927..c7193b47c1 100644
--- a/doc/guides/eventdevs/features/dlb2.ini
+++ b/doc/guides/eventdevs/features/dlb2.ini
@@ -15,6 +15,7 @@ implicit_release_disable   = Y
 runtime_port_link          = Y
 multiple_queue_port        = Y
 maintenance_free           = Y
+independent_enq            = Y
 
 [Eth Rx adapter Features]
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index e0a9aa55a1..cab9f16e01 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -67,6 +67,12 @@ New Features
 
   The new statistics are useful for debugging and profiling.
 
+* **Updated Event Device Library for independent enqueue feature**
+
+  * Added support for independent enqueue feature. With this feature Eventdev
+    supports enqueue in any order or specifically in a different order than
+    dequeue. The feature is intended for eventdevs supporting burst mode.
+
 
 Removed Items
 -------------
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 08e5f9320b..73a44b2ac5 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -446,6 +446,25 @@ struct rte_event;
  * @see RTE_SCHED_TYPE_PARALLEL
  */
 
+#define RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ  (1ULL << 16)
+/**< Event device is capable of independent enqueue.
+ * A new capability, RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ, will indicate that Eventdev
+ * supports the enqueue in any order or specifically in a different order than the
+ * dequeue. Eventdev PMD can either dequeue events in the changed order in which
+ * they are enqueued or restore the original order before sending them to the
+ * underlying hardware device. A flag is provided during the port configuration to
+ * inform Eventdev PMD that the application intends to use an independent enqueue
+ * order on a particular port. Note that this capability only matters for eventdevs
+ * supporting burst mode.
+ *
+ * When an implicit release is enabled on a port, Eventdev PMD will also handle
+ * the insertion of RELEASE events in place of dropped events. The independent enqueue
+ * feature only applies to FORWARD and RELEASE events. New events (op=RTE_EVENT_OP_NEW)
+ * will be dequeued in the order the application enqueues them and do not maintain
+ * any order relative to FORWARD/RELEASE events. FORWARD vs NEW relaxed ordering
+ * only applies to ports that have enabled independent enqueue feature.
+ */
+
 /* Event device priority levels */
 #define RTE_EVENT_DEV_PRIORITY_HIGHEST   0
 /**< Highest priority level for events and queues.
@@ -1072,6 +1091,18 @@ rte_event_queue_attr_set(uint8_t dev_id, uint8_t queue_id, uint32_t attr_id,
  *
  *  @see rte_event_port_setup()
  */
+#define RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ   (1ULL << 5)
+/**< Flag to enable independent enqueue. Must not be set if the device
+ * is not RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ capable. This feature
+ * allows an application to enqueue RTE_EVENT_OP_FORWARD or
+ * RTE_EVENT_OP_RELEASE in an order different than the order the
+ * events were dequeued from the event device, while maintaining
+ * RTE_SCHED_TYPE_ATOMIC or RTE_SCHED_TYPE_ORDERED semantics.
+ *
+ * Note that this flag only matters for Eventdevs supporting burst mode.
+ *
+ *  @see rte_event_port_setup()
+ */
 
 /** Event port configuration structure */
 struct rte_event_port_conf {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 99+ messages in thread

* [PATCH v16 2/3] event/dlb2: add support for independent enqueue
  2024-10-05 18:20                           ` [PATCH v16 0/3] Independent Enqueue Support Abdullah Sevincer
  2024-10-05 18:20                             ` [PATCH v16 1/3] eventdev: add support for independent enqueue Abdullah Sevincer
@ 2024-10-05 18:20                             ` Abdullah Sevincer
  2024-10-05 18:20                             ` [PATCH v16 3/3] event/dsw: add capability " Abdullah Sevincer
  2 siblings, 0 replies; 99+ messages in thread
From: Abdullah Sevincer @ 2024-10-05 18:20 UTC (permalink / raw)
  To: dev
  Cc: jerinj, bruce.richardson, pravin.pathak, mattias.ronnblom,
	manish.aggarwal, Abdullah Sevincer

DLB devices need events to be enqueued in the same order they are
dequeued. Applications are not suppose to change event order between
dequeue and to enqueue. Since Eventdev standard does not add such
restrictions independent enqueue support is needed for DLB PMD so that
it restores dequeue order on enqueue if applications happen to change
it. It also adds missing releases in places where events are dropped
by the application and it expects implicit release to handle it.

By default the feature will be off on all DLB ports and they will
behave the same as older releases. To enable reordering feature,
applications need to add the flag RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ
to port configuration if only the device advertises the capability
RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ.

Signed-off-by: Abdullah Sevincer <abdullah.sevincer@intel.com>
Acked-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
---
 doc/guides/prog_guide/eventdev/eventdev.rst |  33 ++
 doc/guides/rel_notes/release_24_11.rst      |   4 +
 drivers/event/dlb2/dlb2.c                   | 490 +++++++++++++-------
 drivers/event/dlb2/dlb2_avx512.c            |  27 +-
 drivers/event/dlb2/dlb2_inline_fns.h        |   8 +
 drivers/event/dlb2/dlb2_priv.h              |  25 +-
 drivers/event/dlb2/rte_pmd_dlb2.h           |  24 +
 7 files changed, 421 insertions(+), 190 deletions(-)

diff --git a/doc/guides/prog_guide/eventdev/eventdev.rst b/doc/guides/prog_guide/eventdev/eventdev.rst
index fb6dfce102..801e970021 100644
--- a/doc/guides/prog_guide/eventdev/eventdev.rst
+++ b/doc/guides/prog_guide/eventdev/eventdev.rst
@@ -472,6 +472,39 @@ A flush callback can be passed to the function to handle any outstanding events.
 
         Invocation of this API does not affect the existing port configuration.
 
+Independent Enqueue Capability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some eventdev hardware devices such as DLB2 expects all forwarded events to be
+enqueued in the same order as they are dequeued. For dropped events, their
+releases should come at the same location as the original event was expected.
+Hardware has this restriction as it uses the order to retrieve information about
+the original event that was sent to the CPU. This contains information like atomic
+flow ID to release the flow lock and ordered events sequence number to restore the
+original order.
+
+Some applications, like those based on the DPDK dispatcher library, want
+enqueue order independence. To support this, DLB2 PMD supports the
+``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` capability.
+
+This capability applies to Eventdevs supporting burst mode. On ports where
+the application is going to change enqueue order,
+``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` support should be enabled.
+
+Example code to inform PMD that the application plans to use independent enqueue
+order on a port:
+
+    .. code-block:: c
+
+       if (capability & RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ)
+         port_config = port_config | RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
+
+This code example enables enqueue event reordering inside DLB2 PMD before the events
+are sent to the DLB2 hardware. If the application is not going to change the enqueue
+order, this flag should not be enabled to get better performance. DLB2 PMD saves
+ordering information inside the impl_opaque field of the event, and this field should
+be preserved for all FORWARD or RELEASE events.
+
 Stopping the EventDev
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
index cab9f16e01..70f0269779 100644
--- a/doc/guides/rel_notes/release_24_11.rst
+++ b/doc/guides/rel_notes/release_24_11.rst
@@ -73,6 +73,10 @@ New Features
     supports enqueue in any order or specifically in a different order than
     dequeue. The feature is intended for eventdevs supporting burst mode.
 
+  * Updated DLB2 driver for independent enqueue feature. Applications should
+    use ``RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ`` to enable the feature if the
+    capability  ``RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ`` exists.
+
 
 Removed Items
 -------------
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index c43ab864ca..09e4107824 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -82,6 +82,7 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
 			  RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE |
 			  RTE_EVENT_DEV_CAP_RUNTIME_PORT_LINK |
 			  RTE_EVENT_DEV_CAP_MULTIPLE_QUEUE_PORT |
+			  RTE_EVENT_DEV_CAP_INDEPENDENT_ENQ |
 			  RTE_EVENT_DEV_CAP_MAINTENANCE_FREE),
 	.max_profiles_per_port = 1,
 };
@@ -98,6 +99,11 @@ dlb2_free_qe_mem(struct dlb2_port *qm_port)
 	rte_free(qm_port->qe4);
 	qm_port->qe4 = NULL;
 
+	if (qm_port->order) {
+		rte_free(qm_port->order);
+		qm_port->order = NULL;
+	}
+
 	rte_free(qm_port->int_arm_qe);
 	qm_port->int_arm_qe = NULL;
 
@@ -304,7 +310,7 @@ set_max_cq_depth(const char *key __rte_unused,
 	if (*max_cq_depth < DLB2_MIN_CQ_DEPTH_OVERRIDE ||
 	    *max_cq_depth > DLB2_MAX_CQ_DEPTH_OVERRIDE ||
 	    !rte_is_power_of_2(*max_cq_depth)) {
-		DLB2_LOG_ERR("dlb2: max_cq_depth %d and %d and a power of 2",
+		DLB2_LOG_ERR("dlb2: Allowed max_cq_depth range %d - %d and should be power of 2",
 			     DLB2_MIN_CQ_DEPTH_OVERRIDE,
 			     DLB2_MAX_CQ_DEPTH_OVERRIDE);
 		return -EINVAL;
@@ -1445,6 +1451,17 @@ dlb2_init_qe_mem(struct dlb2_port *qm_port, char *mz_name)
 		goto error_exit;
 	}
 
+	if (qm_port->reorder_en) {
+		sz = sizeof(struct dlb2_reorder);
+		qm_port->order = rte_zmalloc(mz_name, sz, RTE_CACHE_LINE_SIZE);
+
+		if (qm_port->order == NULL) {
+			DLB2_LOG_ERR("dlb2: no reorder memory");
+			ret = -ENOMEM;
+			goto error_exit;
+		}
+	}
+
 	ret = dlb2_init_int_arm_qe(qm_port, mz_name);
 	if (ret < 0) {
 		DLB2_LOG_ERR("dlb2: dlb2_init_int_arm_qe ret=%d", ret);
@@ -1541,13 +1558,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 		return -EINVAL;
 	}
 
-	if (dlb2->version == DLB2_HW_V2 && ev_port->cq_weight != 0 &&
-	    ev_port->cq_weight > dequeue_depth) {
-		DLB2_LOG_ERR("dlb2: invalid cq dequeue depth %d, must be >= cq weight %d",
-			     dequeue_depth, ev_port->cq_weight);
-		return -EINVAL;
-	}
-
 	rte_spinlock_lock(&handle->resource_lock);
 
 	/* We round up to the next power of 2 if necessary */
@@ -1620,9 +1630,6 @@ dlb2_hw_create_ldb_port(struct dlb2_eventdev *dlb2,
 					dlb2_error_strings[cfg.response.  status]);
 			goto error_exit;
 		}
-		qm_port->cq_weight = dequeue_depth;
-	} else {
-		qm_port->cq_weight = 0;
 	}
 
 	/* CQs with depth < 8 use an 8-entry queue, but withhold credits so
@@ -1947,6 +1954,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 		evdev_dlb2_default_info.max_event_port_enqueue_depth)
 		return -EINVAL;
 
+	if ((port_conf->event_port_cfg & RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ) &&
+	    port_conf->dequeue_depth > DLB2_MAX_CQ_DEPTH_REORDER) {
+		DLB2_LOG_ERR("evport %d: Max dequeue depth supported with reorder is %d",
+			     ev_port_id, DLB2_MAX_CQ_DEPTH_REORDER);
+		return -EINVAL;
+	}
+
 	ev_port = &dlb2->ev_ports[ev_port_id];
 	/* configured? */
 	if (ev_port->setup_done) {
@@ -1988,7 +2002,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 			     hw_credit_quanta);
 		return -EINVAL;
 	}
-	ev_port->enq_retries = port_conf->enqueue_depth / sw_credit_quanta;
+	ev_port->enq_retries = port_conf->enqueue_depth;
+
+	ev_port->qm_port.reorder_id = 0;
+	ev_port->qm_port.reorder_en = port_conf->event_port_cfg &
+				      RTE_EVENT_PORT_CFG_INDEPENDENT_ENQ;
 
 	/* Save off port config for reconfig */
 	ev_port->conf = *port_conf;
@@ -2792,10 +2810,34 @@ dlb2_check_enqueue_hw_credits(struct dlb2_port *qm_port)
 }
 
 static __rte_always_inline void
-dlb2_pp_write(struct dlb2_enqueue_qe *qe4,
-	      struct process_local_port_data *port_data)
+dlb2_pp_write(struct process_local_port_data *port_data, struct dlb2_enqueue_qe *qe4)
+{
+	dlb2_movdir64b(port_data->pp_addr, qe4);
+}
+
+static __rte_always_inline void
+dlb2_pp_write_reorder(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
+{
+	for (uint8_t i = 0; i < 4; i++) {
+		if (qe4[i].cmd_byte != DLB2_NOOP_CMD_BYTE) {
+			dlb2_movdir64b(port_data->pp_addr, qe4);
+			return;
+		}
+	}
+}
+
+static __rte_always_inline int
+dlb2_pp_check4_write(struct process_local_port_data *port_data,
+	      struct dlb2_enqueue_qe *qe4)
 {
+	for (uint8_t i = 0; i < DLB2_NUM_QES_PER_CACHE_LINE; i++)
+		if (((uint64_t *)&qe4[i])[1] == 0)
+			return 0;
+
 	dlb2_movdir64b(port_data->pp_addr, qe4);
+	memset(qe4, 0, DLB2_NUM_QES_PER_CACHE_LINE * sizeof(struct dlb2_enqueue_qe));
+	return DLB2_NUM_QES_PER_CACHE_LINE;
 }
 
 static inline int
@@ -2815,7 +2857,7 @@ dlb2_consume_qe_immediate(struct dlb2_port *qm_port, int num)
 	 */
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
 
-	dlb2_movntdq_single(port_data->pp_addr, qe);
+	dlb2_movdir64b_single(port_data->pp_addr, qe);
 
 	DLB2_LOG_LINE_DBG("dlb2: consume immediate - %d QEs", num);
 
@@ -2835,7 +2877,7 @@ dlb2_hw_do_enqueue(struct dlb2_port *qm_port,
 	if (do_sfence)
 		rte_wmb();
 
-	dlb2_pp_write(qm_port->qe4, port_data);
+	dlb2_pp_write(port_data, qm_port->qe4);
 }
 
 static inline void
@@ -2986,6 +3028,166 @@ dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 	return 0;
 }
 
+static inline __m128i
+dlb2_event_to_qe(const struct rte_event *ev, uint8_t cmd, uint8_t sched_type, uint8_t qid)
+{
+	__m128i dlb2_to_qe_shuffle = _mm_set_epi8(
+	    0xFF, 0xFF,			 /* zero out cmd word */
+	    1, 0,			 /* low 16-bits of flow id */
+	    0xFF, 0xFF, /* zero QID, sched_type etc fields to be filled later */
+	    3, 2,			 /* top of flow id, event type and subtype */
+	    15, 14, 13, 12, 11, 10, 9, 8 /* data from end of event goes at start */
+	);
+
+	/* event may not be 16 byte aligned. Use 16 byte unaligned load */
+	__m128i tmp = _mm_lddqu_si128((const __m128i *)ev);
+	__m128i qe = _mm_shuffle_epi8(tmp, dlb2_to_qe_shuffle);
+	struct dlb2_enqueue_qe *dq = (struct dlb2_enqueue_qe *)&qe;
+	/* set the cmd field */
+	qe = _mm_insert_epi8(qe, cmd, 15);
+	/* insert missing 16-bits with qid, sched_type and priority */
+	uint16_t qid_stype_prio =
+	    qid | (uint16_t)sched_type << 8 | ((uint16_t)ev->priority & 0xE0) << 5;
+	qe = _mm_insert_epi16(qe, qid_stype_prio, 5);
+	dq->weight = RTE_PMD_DLB2_GET_QE_WEIGHT(ev);
+	return qe;
+}
+
+static inline uint16_t
+__dlb2_event_enqueue_burst_reorder(void *event_port,
+		const struct rte_event events[],
+		uint16_t num,
+		bool use_delayed)
+{
+	struct dlb2_eventdev_port *ev_port = event_port;
+	struct dlb2_port *qm_port = &ev_port->qm_port;
+	struct dlb2_reorder *order = qm_port->order;
+	struct process_local_port_data *port_data;
+	bool is_directed = qm_port->is_directed;
+	uint8_t n = order->next_to_enqueue;
+	uint8_t p_cnt = 0;
+	int retries = ev_port->enq_retries;
+	__m128i new_qes[4], *from = NULL;
+	int num_new = 0;
+	int num_tx;
+	int i;
+
+	RTE_ASSERT(ev_port->enq_configured);
+	RTE_ASSERT(events != NULL);
+
+	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
+
+	num_tx = RTE_MIN(num, ev_port->conf.enqueue_depth);
+#if DLB2_BYPASS_FENCE_ON_PP == 1
+	if (!qm_port->is_producer) /* Call memory fense once at the start */
+		rte_wmb();	   /*  calls _mm_sfence() */
+#else
+	rte_wmb(); /*  calls _mm_sfence() */
+#endif
+	for (i = 0; i < num_tx; i++) {
+		uint8_t sched_type = 0;
+		uint8_t reorder_idx = events[i].impl_opaque;
+		int16_t thresh = qm_port->token_pop_thresh;
+		uint8_t qid = 0;
+		int ret;
+
+		while ((ret = dlb2_event_enqueue_prep(ev_port, qm_port, &events[i],
+						      &sched_type, &qid)) != 0 &&
+		       rte_errno == -ENOSPC && --retries > 0)
+			rte_pause();
+
+		if (ret != 0) /* Either there is error or retires exceeded */
+			break;
+
+		switch (events[i].op) {
+		case RTE_EVENT_OP_NEW:
+			new_qes[num_new++] = dlb2_event_to_qe(
+			    &events[i], DLB2_NEW_CMD_BYTE, sched_type, qid);
+			if (num_new == RTE_DIM(new_qes)) {
+				dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+				num_new = 0;
+			}
+			break;
+		case RTE_EVENT_OP_FORWARD: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NEW_CMD_BYTE : DLB2_FWD_CMD_BYTE,
+			    sched_type, qid);
+			n += dlb2_pp_check4_write(port_data, &order->enq_reorder[n].qe);
+			break;
+		}
+		case RTE_EVENT_OP_RELEASE: {
+			order->enq_reorder[reorder_idx].m128 = dlb2_event_to_qe(
+			    &events[i], is_directed ? DLB2_NOOP_CMD_BYTE : DLB2_COMP_CMD_BYTE,
+			    sched_type, 0xFF);
+			break;
+		}
+		}
+
+		if (use_delayed && qm_port->token_pop_mode == DELAYED_POP &&
+		    (events[i].op == RTE_EVENT_OP_FORWARD ||
+		     events[i].op == RTE_EVENT_OP_RELEASE) &&
+		    qm_port->issued_releases >= thresh - 1) {
+
+			dlb2_consume_qe_immediate(qm_port, qm_port->owed_tokens);
+
+			/* Reset the releases for the next QE batch */
+			qm_port->issued_releases -= thresh;
+
+			/* When using delayed token pop mode, the
+			 * initial token threshold is the full CQ
+			 * depth. After the first token pop, we need to
+			 * reset it to the dequeue_depth.
+			 */
+			qm_port->token_pop_thresh =
+			    qm_port->dequeue_depth;
+		}
+	}
+	while (order->enq_reorder[n].u64[1] != 0) {
+		__m128i tmp[4] = {0}, *send = NULL;
+		bool enq;
+
+		if (!p_cnt)
+			from = &order->enq_reorder[n].m128;
+
+		p_cnt++;
+		n++;
+
+		enq = !n || p_cnt == 4 || !order->enq_reorder[n].u64[1];
+		if (!enq)
+			continue;
+
+		if (p_cnt < 4) {
+			memcpy(tmp, from, p_cnt * sizeof(struct dlb2_enqueue_qe));
+			send = tmp;
+		} else {
+			send  = from;
+		}
+
+		if (is_directed)
+			dlb2_pp_write_reorder(port_data, (struct dlb2_enqueue_qe *)send);
+		else
+			dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)send);
+		memset(from, 0, p_cnt * sizeof(struct dlb2_enqueue_qe));
+		p_cnt = 0;
+	}
+	order->next_to_enqueue = n;
+
+	if (num_new > 0) {
+		switch (num_new) {
+		case 1:
+			new_qes[1] = _mm_setzero_si128(); /* fall-through */
+		case 2:
+			new_qes[2] = _mm_setzero_si128(); /* fall-through */
+		case 3:
+			new_qes[3] = _mm_setzero_si128();
+		}
+		dlb2_pp_write(port_data, (struct dlb2_enqueue_qe *)&new_qes);
+		num_new = 0;
+	}
+
+	return i;
+}
+
 static inline uint16_t
 __dlb2_event_enqueue_burst(void *event_port,
 			   const struct rte_event events[],
@@ -3002,6 +3204,9 @@ __dlb2_event_enqueue_burst(void *event_port,
 	RTE_ASSERT(ev_port->enq_configured);
 	RTE_ASSERT(events != NULL);
 
+	if (qm_port->reorder_en)
+		return __dlb2_event_enqueue_burst_reorder(event_port, events, num, use_delayed);
+
 	i = 0;
 
 	port_data = &dlb2_port[qm_port->id][PORT_TYPE(qm_port)];
@@ -3379,7 +3584,8 @@ dlb2_process_dequeue_qes(struct dlb2_eventdev_port *ev_port,
 		events[num].event_type = qe->u.event_type.major;
 		events[num].sub_event_type = qe->u.event_type.sub;
 		events[num].sched_type = sched_type_map[qe->sched_type];
-		events[num].impl_opaque = qe->qid_depth;
+		events[num].impl_opaque = qm_port->reorder_id++;
+		RTE_PMD_DLB2_SET_QID_DEPTH(&events[num], qe->qid_depth);
 
 		/* qid not preserved for directed queues */
 		if (qm_port->is_directed)
@@ -3414,7 +3620,6 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 	};
 	const int num_events = DLB2_NUM_QES_PER_CACHE_LINE;
 	uint8_t *qid_mappings = qm_port->qid_mappings;
-	__m128i sse_evt[2];
 
 	/* In the unlikely case that any of the QE error bits are set, process
 	 * them one at a time.
@@ -3423,153 +3628,33 @@ dlb2_process_dequeue_four_qes(struct dlb2_eventdev_port *ev_port,
 		     qes[2].error || qes[3].error))
 		return dlb2_process_dequeue_qes(ev_port, qm_port, events,
 						 qes, num_events);
+	const __m128i qe_to_ev_shuffle =
+	    _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, /* last 8-bytes = data from first 8 */
+			 0xFF, 0xFF, 0xFF, 0xFF, /* fill in later as 32-bit value*/
+			 9, 8,			 /* event type and sub-event, + 4 zero bits */
+			 13, 12 /* flow id, 16 bits */);
+	for (int i = 0; i < 4; i++) {
+		const __m128i hw_qe = _mm_load_si128((void *)&qes[i]);
+		const __m128i event = _mm_shuffle_epi8(hw_qe, qe_to_ev_shuffle);
+		/* prepare missing 32-bits for op, sched_type, QID, Priority and
+		 * sequence number in impl_opaque
+		 */
+		const uint16_t qid_sched_prio = _mm_extract_epi16(hw_qe, 5);
+		/* Extract qid_depth and format it as per event header */
+		const uint8_t qid_depth = (_mm_extract_epi8(hw_qe, 15) & 0x6) << 1;
+		const uint32_t qid =  (qm_port->is_directed) ? ev_port->link[0].queue_id :
+					qid_mappings[(uint8_t)qid_sched_prio];
+		const uint32_t sched_type = sched_type_map[(qid_sched_prio >> 8) & 0x3];
+		const uint32_t priority = (qid_sched_prio >> 5) & 0xE0;
 
-	events[0].u64 = qes[0].data;
-	events[1].u64 = qes[1].data;
-	events[2].u64 = qes[2].data;
-	events[3].u64 = qes[3].data;
-
-	/* Construct the metadata portion of two struct rte_events
-	 * in one 128b SSE register. Event metadata is constructed in the SSE
-	 * registers like so:
-	 * sse_evt[0][63:0]:   event[0]'s metadata
-	 * sse_evt[0][127:64]: event[1]'s metadata
-	 * sse_evt[1][63:0]:   event[2]'s metadata
-	 * sse_evt[1][127:64]: event[3]'s metadata
-	 */
-	sse_evt[0] = _mm_setzero_si128();
-	sse_evt[1] = _mm_setzero_si128();
-
-	/* Convert the hardware queue ID to an event queue ID and store it in
-	 * the metadata:
-	 * sse_evt[0][47:40]   = qid_mappings[qes[0].qid]
-	 * sse_evt[0][111:104] = qid_mappings[qes[1].qid]
-	 * sse_evt[1][47:40]   = qid_mappings[qes[2].qid]
-	 * sse_evt[1][111:104] = qid_mappings[qes[3].qid]
-	 */
-#define DLB_EVENT_QUEUE_ID_BYTE 5
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[0].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-				     qid_mappings[qes[1].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[2].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-				     qid_mappings[qes[3].qid],
-				     DLB_EVENT_QUEUE_ID_BYTE + 8);
-
-	/* Convert the hardware priority to an event priority and store it in
-	 * the metadata, while also returning the queue depth status
-	 * value captured by the hardware, storing it in impl_opaque, which can
-	 * be read by the application but not modified
-	 * sse_evt[0][55:48]   = DLB2_TO_EV_PRIO(qes[0].priority)
-	 * sse_evt[0][63:56]   = qes[0].qid_depth
-	 * sse_evt[0][119:112] = DLB2_TO_EV_PRIO(qes[1].priority)
-	 * sse_evt[0][127:120] = qes[1].qid_depth
-	 * sse_evt[1][55:48]   = DLB2_TO_EV_PRIO(qes[2].priority)
-	 * sse_evt[1][63:56]   = qes[2].qid_depth
-	 * sse_evt[1][119:112] = DLB2_TO_EV_PRIO(qes[3].priority)
-	 * sse_evt[1][127:120] = qes[3].qid_depth
-	 */
-#define DLB_EVENT_PRIO_IMPL_OPAQUE_WORD 3
-#define DLB_BYTE_SHIFT 8
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[0].priority) |
-			(qes[0].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[0] =
-		_mm_insert_epi16(sse_evt[0],
-			DLB2_TO_EV_PRIO((uint8_t)qes[1].priority) |
-			(qes[1].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[2].priority) |
-			(qes[2].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD);
-	sse_evt[1] =
-		_mm_insert_epi16(sse_evt[1],
-			DLB2_TO_EV_PRIO((uint8_t)qes[3].priority) |
-			(qes[3].qid_depth << DLB_BYTE_SHIFT),
-			DLB_EVENT_PRIO_IMPL_OPAQUE_WORD + 4);
-
-	/* Write the event type, sub event type, and flow_id to the event
-	 * metadata.
-	 * sse_evt[0][31:0]   = qes[0].flow_id |
-	 *			qes[0].u.event_type.major << 28 |
-	 *			qes[0].u.event_type.sub << 20;
-	 * sse_evt[0][95:64]  = qes[1].flow_id |
-	 *			qes[1].u.event_type.major << 28 |
-	 *			qes[1].u.event_type.sub << 20;
-	 * sse_evt[1][31:0]   = qes[2].flow_id |
-	 *			qes[2].u.event_type.major << 28 |
-	 *			qes[2].u.event_type.sub << 20;
-	 * sse_evt[1][95:64]  = qes[3].flow_id |
-	 *			qes[3].u.event_type.major << 28 |
-	 *			qes[3].u.event_type.sub << 20;
-	 */
-#define DLB_EVENT_EV_TYPE_DW 0
-#define DLB_EVENT_EV_TYPE_SHIFT 28
-#define DLB_EVENT_SUB_EV_TYPE_SHIFT 20
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[0].flow_id |
-			qes[0].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[0].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[0] = _mm_insert_epi32(sse_evt[0],
-			qes[1].flow_id |
-			qes[1].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[1].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[2].flow_id |
-			qes[2].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT |
-			qes[2].u.event_type.sub <<  DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW);
-	sse_evt[1] = _mm_insert_epi32(sse_evt[1],
-			qes[3].flow_id |
-			qes[3].u.event_type.major << DLB_EVENT_EV_TYPE_SHIFT  |
-			qes[3].u.event_type.sub << DLB_EVENT_SUB_EV_TYPE_SHIFT,
-			DLB_EVENT_EV_TYPE_DW + 2);
-
-	/* Write the sched type to the event metadata. 'op' and 'rsvd' are not
-	 * set:
-	 * sse_evt[0][39:32]  = sched_type_map[qes[0].sched_type] << 6
-	 * sse_evt[0][103:96] = sched_type_map[qes[1].sched_type] << 6
-	 * sse_evt[1][39:32]  = sched_type_map[qes[2].sched_type] << 6
-	 * sse_evt[1][103:96] = sched_type_map[qes[3].sched_type] << 6
-	 */
-#define DLB_EVENT_SCHED_TYPE_BYTE 4
-#define DLB_EVENT_SCHED_TYPE_SHIFT 6
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[0].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[0] = _mm_insert_epi8(sse_evt[0],
-		sched_type_map[qes[1].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[2].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE);
-	sse_evt[1] = _mm_insert_epi8(sse_evt[1],
-		sched_type_map[qes[3].sched_type] << DLB_EVENT_SCHED_TYPE_SHIFT,
-		DLB_EVENT_SCHED_TYPE_BYTE + 8);
-
-	/* Store the metadata to the event (use the double-precision
-	 * _mm_storeh_pd because there is no integer function for storing the
-	 * upper 64b):
-	 * events[0].event = sse_evt[0][63:0]
-	 * events[1].event = sse_evt[0][127:64]
-	 * events[2].event = sse_evt[1][63:0]
-	 * events[3].event = sse_evt[1][127:64]
-	 */
-	_mm_storel_epi64((__m128i *)&events[0].event, sse_evt[0]);
-	_mm_storeh_pd((double *)&events[1].event, (__m128d) sse_evt[0]);
-	_mm_storel_epi64((__m128i *)&events[2].event, sse_evt[1]);
-	_mm_storeh_pd((double *)&events[3].event, (__m128d) sse_evt[1]);
+		const uint32_t dword1 = qid_depth |
+		    sched_type << 6 | qid << 8 | priority << 16 | (qm_port->reorder_id + i) << 24;
+
+		/* events[] may not be 16 byte aligned. So use separate load and store */
+		const __m128i tmpEv = _mm_insert_epi32(event, dword1, 1);
+		_mm_storeu_si128((__m128i *) &events[i], tmpEv);
+	}
+	qm_port->reorder_id += 4;
 
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[0].sched_type], 1);
 	DLB2_INC_STAT(ev_port->stats.rx_sched_cnt[qes[1].sched_type], 1);
@@ -3722,6 +3807,15 @@ _process_deq_qes_vec_impl(struct dlb2_port *qm_port,
 			0x00, 0x00, 0x00, 0x03,
 			0x00, 0x00, 0x00, 0x03,
 		};
+
+		static const uint8_t qid_depth_mask[16] = {
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+			0x00, 0x00, 0x00, 0x06,
+		};
+		const __m128i v_qid_depth_mask  = _mm_loadu_si128(
+						  (const __m128i *)qid_depth_mask