DPDK patches and discussions
 help / color / mirror / Atom feed
* [PATCH] event/dlb2: add support for single 512B write of 4 QEs
@ 2022-04-09 15:18 Timothy McDaniel
  2022-05-14 12:07 ` Jerin Jacob
                   ` (6 more replies)
  0 siblings, 7 replies; 20+ messages in thread
From: Timothy McDaniel @ 2022-04-09 15:18 UTC (permalink / raw)
  To: jerinj; +Cc: dev

On Xeon, as 512b accesses are available, movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
---
 drivers/event/dlb2/dlb2.c | 86 ++++++++++++++++++++++++++++++---------
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 36f07d0061..e2a5303310 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -2776,25 +2776,73 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
 						ev[3].event_type,
 					     DLB2_QE_EV_TYPE_WORD + 4);
 
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
+ #ifdef __AVX512VL__
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+#else
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+#endif
 
 		break;
 	case 3:
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
@ 2022-05-14 12:07 ` Jerin Jacob
  2022-05-16  8:42   ` Bruce Richardson
  2022-05-16 17:00   ` McDaniel, Timothy
  2022-05-19 20:24 ` [PATCH v3] " Timothy McDaniel
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 20+ messages in thread
From: Jerin Jacob @ 2022-05-14 12:07 UTC (permalink / raw)
  To: Timothy McDaniel, Richardson, Bruce, konstantin.v.ananyev
  Cc: Jerin Jacob, dpdk-dev

On Sat, Apr 9, 2022 at 8:48 PM Timothy McDaniel
<timothy.mcdaniel@intel.com> wrote:
>
> On Xeon, as 512b accesses are available, movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
>
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> ---
>  drivers/event/dlb2/dlb2.c | 86 ++++++++++++++++++++++++++++++---------
>  1 file changed, 67 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index 36f07d0061..e2a5303310 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -2776,25 +2776,73 @@ dlb2_event_build_hcws(struct dlb2_port *qm_port,
>                                                 ev[3].event_type,
>                                              DLB2_QE_EV_TYPE_WORD + 4);
>
> -               /* Store the metadata to memory (use the double-precision
> -                * _mm_storeh_pd because there is no integer function for
> -                * storing the upper 64b):
> -                * qe[0] metadata = sse_qe[0][63:0]
> -                * qe[1] metadata = sse_qe[0][127:64]
> -                * qe[2] metadata = sse_qe[1][63:0]
> -                * qe[3] metadata = sse_qe[1][127:64]
> -                */
> -               _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
> -               _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> -                             (__m128d)sse_qe[0]);
> -               _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
> -               _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> -                             (__m128d)sse_qe[1]);
> -
> -               qe[0].data = ev[0].u64;
> -               qe[1].data = ev[1].u64;
> -               qe[2].data = ev[2].u64;
> -               qe[3].data = ev[3].u64;
> + #ifdef __AVX512VL__

+ x86 maintainers

We need a runtime check based on CPU flags. Right? As the build and
run machine can be different?

> +
> +                       /*
> +                        * 1) Build avx512 QE store and build each
> +                        *    QE individually as XMM register
> +                        * 2) Merge the 4 XMM registers/QEs into single AVX512
> +                        *    register
> +                        * 3) Store single avx512 register to &qe[0] (4x QEs
> +                        *    stored in 1x store)
> +                        */
> +
> +                       __m128i v_qe0 = _mm_setzero_si128();
> +                       uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
> +                       v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
> +                       v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
> +
> +                       __m128i v_qe1 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[0], 1);
> +                       v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
> +                       v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
> +
> +                       __m128i v_qe2 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[1], 0);
> +                       v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
> +                       v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
> +
> +                       __m128i v_qe3 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[1], 1);
> +                       v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
> +                       v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
> +
> +                       /* we have 4x XMM registers, one per QE. */
> +                       __m512i v_all_qes = _mm512_setzero_si512();
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
> +
> +                       /*
> +                        * store the 4x QEs in a single register to the scratch
> +                        * space of the PMD
> +                        */
> +                       _mm512_store_si512(&qe[0], v_all_qes);
> +#else
> +                       /*
> +                        * Store the metadata to memory (use the double-precision
> +                        * _mm_storeh_pd because there is no integer function for
> +                        * storing the upper 64b):
> +                        * qe[0] metadata = sse_qe[0][63:0]
> +                        * qe[1] metadata = sse_qe[0][127:64]
> +                        * qe[2] metadata = sse_qe[1][63:0]
> +                        * qe[3] metadata = sse_qe[1][127:64]
> +                        */
> +                       _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
> +                                        sse_qe[0]);
> +                       _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> +                                     (__m128d)sse_qe[0]);
> +                       _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
> +                                        sse_qe[1]);
> +                       _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> +                                     (__m128d)sse_qe[1]);
> +
> +                       qe[0].data = ev[0].u64;
> +                       qe[1].data = ev[1].u64;
> +                       qe[2].data = ev[2].u64;
> +                       qe[3].data = ev[3].u64;
> +#endif
>
>                 break;
>         case 3:
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-14 12:07 ` Jerin Jacob
@ 2022-05-16  8:42   ` Bruce Richardson
  2022-05-16 17:00   ` McDaniel, Timothy
  1 sibling, 0 replies; 20+ messages in thread
From: Bruce Richardson @ 2022-05-16  8:42 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: Timothy McDaniel, konstantin.v.ananyev, Jerin Jacob, dpdk-dev

On Sat, May 14, 2022 at 05:37:39PM +0530, Jerin Jacob wrote:
> On Sat, Apr 9, 2022 at 8:48 PM Timothy McDaniel
> <timothy.mcdaniel@intel.com> wrote:
> >
> > On Xeon, as 512b accesses are available, movdir64 instruction is able
> > to perform 512b read and write to DLB producer port. In order for
> > movdir64 to be able to pull its data from store buffers
> > (store-buffer-forwarding) (before actual write), data should be in
> > single 512b write format.  This commit add change when code is built
> > for Xeon with 512b AVX support to make single 512b write of all 4 QEs
> > instead of 4x64b writes.
> >
> > Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com> ---
> > drivers/event/dlb2/dlb2.c | 86 ++++++++++++++++++++++++++++++---------
> > 1 file changed, 67 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> > index 36f07d0061..e2a5303310 100644 --- a/drivers/event/dlb2/dlb2.c +++
> > b/drivers/event/dlb2/dlb2.c @@ -2776,25 +2776,73 @@
> > dlb2_event_build_hcws(struct dlb2_port *qm_port, ev[3].event_type,
> > DLB2_QE_EV_TYPE_WORD + 4);
> >
> > -               /* Store the metadata to memory (use the
> > double-precision -                * _mm_storeh_pd because there is no
> > integer function for -                * storing the upper 64b): -
> > * qe[0] metadata = sse_qe[0][63:0] -                * qe[1] metadata =
> > sse_qe[0][127:64] -                * qe[2] metadata = sse_qe[1][63:0] -
> > * qe[3] metadata = sse_qe[1][127:64] -                */ -
> > _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]); -
> > _mm_storeh_pd((double *)&qe[1].u.opaque_data, -
> > (__m128d)sse_qe[0]); -               _mm_storel_epi64((__m128i
> > *)&qe[2].u.opaque_data, sse_qe[1]); -
> > _mm_storeh_pd((double *)&qe[3].u.opaque_data, -
> > (__m128d)sse_qe[1]); - -               qe[0].data = ev[0].u64; -
> > qe[1].data = ev[1].u64; -               qe[2].data = ev[2].u64; -
> > qe[3].data = ev[3].u64; + #ifdef __AVX512VL__
> 
> + x86 maintainers
> 
> We need a runtime check based on CPU flags. Right? As the build and run
> machine can be different?
> 
Ideally, yes, this should be a run-time decision. There are quite a number
of examples of this in DPDK. However, most uses of runtime decisions are in
functions called via function pointer, so not sure if those schemes apply
here. It's certainly worth investigating, though.

/Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-14 12:07 ` Jerin Jacob
  2022-05-16  8:42   ` Bruce Richardson
@ 2022-05-16 17:00   ` McDaniel, Timothy
  1 sibling, 0 replies; 20+ messages in thread
From: McDaniel, Timothy @ 2022-05-16 17:00 UTC (permalink / raw)
  To: Jerin Jacob, Richardson, Bruce, konstantin.v.ananyev
  Cc: Jerin Jacob, dpdk-dev, Pathak, Pravin



> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Saturday, May 14, 2022 7:08 AM
> To: McDaniel, Timothy <timothy.mcdaniel@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; konstantin.v.ananyev@yandex.ru
> Cc: Jerin Jacob <jerinj@marvell.com>; dpdk-dev <dev@dpdk.org>
> Subject: Re: [PATCH] event/dlb2: add support for single 512B write of 4 QEs
> 
> On Sat, Apr 9, 2022 at 8:48 PM Timothy McDaniel
> <timothy.mcdaniel@intel.com> wrote:
> >
> > On Xeon, as 512b accesses are available, movdir64 instruction is able to
> > perform 512b read and write to DLB producer port. In order for movdir64
> > to be able to pull its data from store buffers (store-buffer-forwarding)
> > (before actual write), data should be in single 512b write format.
> > This commit add change when code is built for Xeon with 512b AVX support
> > to make single 512b write of all 4 QEs instead of 4x64b writes.
> >
> > Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> > ---
> >  drivers/event/dlb2/dlb2.c | 86 ++++++++++++++++++++++++++++++---------
> >  1 file changed, 67 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> > index 36f07d0061..e2a5303310 100644
> > --- a/drivers/event/dlb2/dlb2.c
> > +++ b/drivers/event/dlb2/dlb2.c
> > @@ -2776,25 +2776,73 @@ dlb2_event_build_hcws(struct dlb2_port
> *qm_port,
> >                                                 ev[3].event_type,
> >                                              DLB2_QE_EV_TYPE_WORD + 4);
> >
> > -               /* Store the metadata to memory (use the double-precision
> > -                * _mm_storeh_pd because there is no integer function for
> > -                * storing the upper 64b):
> > -                * qe[0] metadata = sse_qe[0][63:0]
> > -                * qe[1] metadata = sse_qe[0][127:64]
> > -                * qe[2] metadata = sse_qe[1][63:0]
> > -                * qe[3] metadata = sse_qe[1][127:64]
> > -                */
> > -               _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
> > -               _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> > -                             (__m128d)sse_qe[0]);
> > -               _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
> > -               _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> > -                             (__m128d)sse_qe[1]);
> > -
> > -               qe[0].data = ev[0].u64;
> > -               qe[1].data = ev[1].u64;
> > -               qe[2].data = ev[2].u64;
> > -               qe[3].data = ev[3].u64;
> > + #ifdef __AVX512VL__
> 
> + x86 maintainers
> 
> We need a runtime check based on CPU flags. Right? As the build and
> run machine can be different?

Thanks Jerin. I will convert to a runtime check.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
  2022-05-14 12:07 ` Jerin Jacob
@ 2022-05-19 20:24 ` Timothy McDaniel
  2022-05-23 16:09 ` [PATCH v4] " Timothy McDaniel
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: Timothy McDaniel @ 2022-05-19 20:24 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, as 512b accesses are available, movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>
===

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c          | 206 +---------------------
 drivers/event/dlb2/dlb2_avx512.c   | 267 +++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_noavx512.c | 219 +++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h     |   8 +
 drivers/event/dlb2/meson.build     |  14 ++
 5 files changed, 513 insertions(+), 201 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_noavx512.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 36f07d0061..ac7572a28d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1834,6 +1834,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL))
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+
 	return 0;
 }
 
@@ -2430,21 +2435,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2639,192 +2629,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..ce2d006006
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_noavx512.c b/drivers/event/dlb2/dlb2_noavx512.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_noavx512.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 3e47e4776b..b02d28467f 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -377,6 +377,7 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+	bool use_avx512;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -685,6 +686,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..efd93c7093 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,20 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+dlb2_avx512_support = false
+
+if dpdk_conf.has('RTE_ARCH_X86_64')
+        dlb2_avx512_support = (
+	    cc.get_define('__AVX512VL__', args: machine_args) != ''
+	)
+endif
+
+if dlb2_avx512_support == true
+        sources += files('dlb2_avx512.c')
+else
+        sources += files('dlb2_noavx512.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
  2022-05-14 12:07 ` Jerin Jacob
  2022-05-19 20:24 ` [PATCH v3] " Timothy McDaniel
@ 2022-05-23 16:09 ` Timothy McDaniel
  2022-05-23 16:34   ` Bruce Richardson
  2022-05-23 16:37   ` Bruce Richardson
  2022-06-10 12:43 ` [PATCH v6] " Timothy McDaniel
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 20+ messages in thread
From: Timothy McDaniel @ 2022-05-23 16:09 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, as 512b accesses are available, movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>
===

Changes since V3:
1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
for new file name.

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c        | 206 +-----------------------
 drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h   |   8 +
 drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/meson.build   |  14 ++
 5 files changed, 513 insertions(+), 201 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_sve.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 36f07d0061..ac7572a28d 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1834,6 +1834,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL))
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+
 	return 0;
 }
 
@@ -2430,21 +2435,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2639,192 +2629,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..ce2d006006
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 3e47e4776b..b02d28467f 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -377,6 +377,7 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+	bool use_avx512;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -685,6 +686,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/dlb2_sve.c b/drivers/event/dlb2/dlb2_sve.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sve.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..0ad4d31785 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,20 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+dlb2_avx512_support = false
+
+if dpdk_conf.has('RTE_ARCH_X86_64')
+        dlb2_avx512_support = (
+	    cc.get_define('__AVX512VL__', args: machine_args) != ''
+	)
+endif
+
+if dlb2_avx512_support == true
+        sources += files('dlb2_avx512.c')
+else
+        sources += files('dlb2_sve.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:09 ` [PATCH v4] " Timothy McDaniel
@ 2022-05-23 16:34   ` Bruce Richardson
  2022-05-23 16:52     ` McDaniel, Timothy
  2022-05-23 16:37   ` Bruce Richardson
  1 sibling, 1 reply; 20+ messages in thread
From: Bruce Richardson @ 2022-05-23 16:34 UTC (permalink / raw)
  To: Timothy McDaniel; +Cc: jerinj, dev, Kent Wires

On Mon, May 23, 2022 at 11:09:55AM -0500, Timothy McDaniel wrote:
> On Xeon, as 512b accesses are available, movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
> 
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> Acked-by: Kent Wires <kent.wires@intel.com>
> ===
> 
> Changes since V3:
> 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> for new file name.
> 
> Changes since V1:
> 1) Split out dlb2_event_build_hcws into two implementations, one
> that uses AVX512 instructions, and one that does not. Each implementation
> is in its own source file in order to avoid build errors if the compiler
> does not support the newer AVX512 instructions.
> 2) Update meson.build to and pull in appropriate source file based on
> whether the compiler supports AVX512VL
> 3) Check if target supports AVX512VL, and use appropriate implementation
> based on this runtime check.
> ---
>  drivers/event/dlb2/dlb2.c        | 206 +-----------------------
>  drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
>  drivers/event/dlb2/dlb2_priv.h   |   8 +
>  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
>  drivers/event/dlb2/meson.build   |  14 ++
>  5 files changed, 513 insertions(+), 201 deletions(-)
>  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
>  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> 
<snip>
> diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
> index f963589fd3..0ad4d31785 100644
> --- a/drivers/event/dlb2/meson.build
> +++ b/drivers/event/dlb2/meson.build
> @@ -19,6 +19,20 @@ sources = files(
>          'dlb2_selftest.c',
>  )
>  
> +dlb2_avx512_support = false
> +
> +if dpdk_conf.has('RTE_ARCH_X86_64')
> +        dlb2_avx512_support = (
> +	    cc.get_define('__AVX512VL__', args: machine_args) != ''
> +	)
> +endif
> +
> +if dlb2_avx512_support == true
> +        sources += files('dlb2_avx512.c')
> +else
> +        sources += files('dlb2_sve.c')
> +endif
> +
>  headers = files('rte_pmd_dlb2.h')
>  
>  deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']

I believe this can be improved upon further, since it still does not allow
a generic build to opportunistically use the AVX-512 code path. It also
makes the runtime check largely pointless as the whole build will have been
done with global AVX-512 support, meaning that the binary likely will fail
to run if AVX-512 is not available.

Instead, I'd recommend doing as other places in DPDK - such as in ACL
library, or i40e or ice net drivers - where we not only check the current
build support, but also check the compiler support. That way, even if we
are building for e.g. a target of AVX2, we can still build the AVX-512
parts using the appropriate compiler flags, and choose them
opportunistically at runtime. See the meson.build files in any of the above
component directories for examples.

Regards,

/Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:09 ` [PATCH v4] " Timothy McDaniel
  2022-05-23 16:34   ` Bruce Richardson
@ 2022-05-23 16:37   ` Bruce Richardson
  2022-05-23 16:45     ` McDaniel, Timothy
  1 sibling, 1 reply; 20+ messages in thread
From: Bruce Richardson @ 2022-05-23 16:37 UTC (permalink / raw)
  To: Timothy McDaniel; +Cc: jerinj, dev, Kent Wires

On Mon, May 23, 2022 at 11:09:55AM -0500, Timothy McDaniel wrote:
> On Xeon, as 512b accesses are available, movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
> 
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> Acked-by: Kent Wires <kent.wires@intel.com>
> ===
> 
> Changes since V3:
> 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> for new file name.
> 
> Changes since V1:
> 1) Split out dlb2_event_build_hcws into two implementations, one
> that uses AVX512 instructions, and one that does not. Each implementation
> is in its own source file in order to avoid build errors if the compiler
> does not support the newer AVX512 instructions.
> 2) Update meson.build to and pull in appropriate source file based on
> whether the compiler supports AVX512VL
> 3) Check if target supports AVX512VL, and use appropriate implementation
> based on this runtime check.
> ---
>  drivers/event/dlb2/dlb2.c        | 206 +-----------------------
>  drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
>  drivers/event/dlb2/dlb2_priv.h   |   8 +
>  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
>  drivers/event/dlb2/meson.build   |  14 ++
>  5 files changed, 513 insertions(+), 201 deletions(-)
>  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
>  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> 
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index 36f07d0061..ac7572a28d 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -1834,6 +1834,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
>  
>  	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
>  
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL))
> +		ev_port->qm_port.use_avx512 = true;
> +	else
> +		ev_port->qm_port.use_avx512 = false;
> +
>  	return 0;
>  }
>  

Additional comment for this runtime check. You also should check the
max_simd_bitwidth in DPDK i.e. the value specified with
--force-max-simd-bitwidth EAL argument, or set programmatically by the app.
This is to allow the user runtime control over when the various instruction
sets get used, and it's also very useful for testing and debugging various
code paths.

/Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:37   ` Bruce Richardson
@ 2022-05-23 16:45     ` McDaniel, Timothy
  0 siblings, 0 replies; 20+ messages in thread
From: McDaniel, Timothy @ 2022-05-23 16:45 UTC (permalink / raw)
  To: Richardson, Bruce; +Cc: jerinj, dev, Wires, Kent

Sorry Bruce, but I don't have a clue what you are talking about here.

> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Monday, May 23, 2022 11:37 AM
> To: McDaniel, Timothy <timothy.mcdaniel@intel.com>
> Cc: jerinj@marvell.com; dev@dpdk.org; Wires, Kent <kent.wires@intel.com>
> Subject: Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
> 
> On Mon, May 23, 2022 at 11:09:55AM -0500, Timothy McDaniel wrote:
> > On Xeon, as 512b accesses are available, movdir64 instruction is able to
> > perform 512b read and write to DLB producer port. In order for movdir64
> > to be able to pull its data from store buffers (store-buffer-forwarding)
> > (before actual write), data should be in single 512b write format.
> > This commit add change when code is built for Xeon with 512b AVX support
> > to make single 512b write of all 4 QEs instead of 4x64b writes.
> >
> > Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> > Acked-by: Kent Wires <kent.wires@intel.com>
> > ===
> >
> > Changes since V3:
> > 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> > for new file name.
> >
> > Changes since V1:
> > 1) Split out dlb2_event_build_hcws into two implementations, one
> > that uses AVX512 instructions, and one that does not. Each implementation
> > is in its own source file in order to avoid build errors if the compiler
> > does not support the newer AVX512 instructions.
> > 2) Update meson.build to and pull in appropriate source file based on
> > whether the compiler supports AVX512VL
> > 3) Check if target supports AVX512VL, and use appropriate implementation
> > based on this runtime check.
> > ---
> >  drivers/event/dlb2/dlb2.c        | 206 +-----------------------
> >  drivers/event/dlb2/dlb2_avx512.c | 267
> +++++++++++++++++++++++++++++++
> >  drivers/event/dlb2/dlb2_priv.h   |   8 +
> >  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
> >  drivers/event/dlb2/meson.build   |  14 ++
> >  5 files changed, 513 insertions(+), 201 deletions(-)
> >  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
> >  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> >
> > diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> > index 36f07d0061..ac7572a28d 100644
> > --- a/drivers/event/dlb2/dlb2.c
> > +++ b/drivers/event/dlb2/dlb2.c
> > @@ -1834,6 +1834,11 @@ dlb2_eventdev_port_setup(struct rte_eventdev
> *dev,
> >
> >  	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
> >
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL))
> > +		ev_port->qm_port.use_avx512 = true;
> > +	else
> > +		ev_port->qm_port.use_avx512 = false;
> > +
> >  	return 0;
> >  }
> >
> 
> Additional comment for this runtime check. You also should check the
> max_simd_bitwidth in DPDK i.e. the value specified with
> --force-max-simd-bitwidth EAL argument, or set programmatically by the app.
> This is to allow the user runtime control over when the various instruction
> sets get used, and it's also very useful for testing and debugging various
> code paths.
> 
> /Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:34   ` Bruce Richardson
@ 2022-05-23 16:52     ` McDaniel, Timothy
  2022-05-23 16:55       ` Bruce Richardson
  0 siblings, 1 reply; 20+ messages in thread
From: McDaniel, Timothy @ 2022-05-23 16:52 UTC (permalink / raw)
  To: Richardson, Bruce; +Cc: jerinj, dev, Wires, Kent



> -----Original Message-----
> From: Richardson, Bruce <bruce.richardson@intel.com>
> Sent: Monday, May 23, 2022 11:34 AM
> To: McDaniel, Timothy <timothy.mcdaniel@intel.com>
> Cc: jerinj@marvell.com; dev@dpdk.org; Wires, Kent <kent.wires@intel.com>
> Subject: Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
> 
> On Mon, May 23, 2022 at 11:09:55AM -0500, Timothy McDaniel wrote:
> > On Xeon, as 512b accesses are available, movdir64 instruction is able to
> > perform 512b read and write to DLB producer port. In order for movdir64
> > to be able to pull its data from store buffers (store-buffer-forwarding)
> > (before actual write), data should be in single 512b write format.
> > This commit add change when code is built for Xeon with 512b AVX support
> > to make single 512b write of all 4 QEs instead of 4x64b writes.
> >
> > Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> > Acked-by: Kent Wires <kent.wires@intel.com>
> > ===
> >
> > Changes since V3:
> > 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> > for new file name.
> >
> > Changes since V1:
> > 1) Split out dlb2_event_build_hcws into two implementations, one
> > that uses AVX512 instructions, and one that does not. Each implementation
> > is in its own source file in order to avoid build errors if the compiler
> > does not support the newer AVX512 instructions.
> > 2) Update meson.build to and pull in appropriate source file based on
> > whether the compiler supports AVX512VL
> > 3) Check if target supports AVX512VL, and use appropriate implementation
> > based on this runtime check.
> > ---
> >  drivers/event/dlb2/dlb2.c        | 206 +-----------------------
> >  drivers/event/dlb2/dlb2_avx512.c | 267
> +++++++++++++++++++++++++++++++
> >  drivers/event/dlb2/dlb2_priv.h   |   8 +
> >  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
> >  drivers/event/dlb2/meson.build   |  14 ++
> >  5 files changed, 513 insertions(+), 201 deletions(-)
> >  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
> >  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> >
> <snip>
> > diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
> > index f963589fd3..0ad4d31785 100644
> > --- a/drivers/event/dlb2/meson.build
> > +++ b/drivers/event/dlb2/meson.build
> > @@ -19,6 +19,20 @@ sources = files(
> >          'dlb2_selftest.c',
> >  )
> >
> > +dlb2_avx512_support = false
> > +
> > +if dpdk_conf.has('RTE_ARCH_X86_64')
> > +        dlb2_avx512_support = (
> > +	    cc.get_define('__AVX512VL__', args: machine_args) != ''
> > +	)
> > +endif
> > +
> > +if dlb2_avx512_support == true
> > +        sources += files('dlb2_avx512.c')
> > +else
> > +        sources += files('dlb2_sve.c')
> > +endif
> > +
> >  headers = files('rte_pmd_dlb2.h')
> >
> >  deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
> 
> I believe this can be improved upon further, since it still does not allow
> a generic build to opportunistically use the AVX-512 code path. 

What does this mean - " generic build to opportunistically use the AVX-512 code path"

It also
> makes the runtime check largely pointless as the whole build will have been
> done with global AVX-512 support, meaning that the binary likely will fail
> to run if AVX-512 is not available.

If built for avx512, then that build supports using either avx512, or not.

> 
> Instead, I'd recommend doing as other places in DPDK - such as in ACL
> library, or i40e or ice net drivers - where we not only check the current
> build support, but also check the compiler support. That way, even if we
> are building for e.g. a target of AVX2, we can still build the AVX-512
> parts using the appropriate compiler flags, and choose them
> opportunistically at runtime. 

I do not understand what you are getting at here. 

See the meson.build files in any of the above
> component directories for examples.
> 
> Regards,
> 
> /Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:52     ` McDaniel, Timothy
@ 2022-05-23 16:55       ` Bruce Richardson
  2022-06-09 17:40         ` Jerin Jacob
  0 siblings, 1 reply; 20+ messages in thread
From: Bruce Richardson @ 2022-05-23 16:55 UTC (permalink / raw)
  To: McDaniel, Timothy; +Cc: jerinj, dev, Wires, Kent

On Mon, May 23, 2022 at 05:52:06PM +0100, McDaniel, Timothy wrote:
> 
> 
> > -----Original Message-----
> > From: Richardson, Bruce <bruce.richardson@intel.com>
> > Sent: Monday, May 23, 2022 11:34 AM
> > To: McDaniel, Timothy <timothy.mcdaniel@intel.com>
> > Cc: jerinj@marvell.com; dev@dpdk.org; Wires, Kent <kent.wires@intel.com>
> > Subject: Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
> >
> > On Mon, May 23, 2022 at 11:09:55AM -0500, Timothy McDaniel wrote:
> > > On Xeon, as 512b accesses are available, movdir64 instruction is able to
> > > perform 512b read and write to DLB producer port. In order for movdir64
> > > to be able to pull its data from store buffers (store-buffer-forwarding)
> > > (before actual write), data should be in single 512b write format.
> > > This commit add change when code is built for Xeon with 512b AVX support
> > > to make single 512b write of all 4 QEs instead of 4x64b writes.
> > >
> > > Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> > > Acked-by: Kent Wires <kent.wires@intel.com>
> > > ===
> > >
> > > Changes since V3:
> > > 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> > > for new file name.
> > >
> > > Changes since V1:
> > > 1) Split out dlb2_event_build_hcws into two implementations, one
> > > that uses AVX512 instructions, and one that does not. Each implementation
> > > is in its own source file in order to avoid build errors if the compiler
> > > does not support the newer AVX512 instructions.
> > > 2) Update meson.build to and pull in appropriate source file based on
> > > whether the compiler supports AVX512VL
> > > 3) Check if target supports AVX512VL, and use appropriate implementation
> > > based on this runtime check.
> > > ---
> > >  drivers/event/dlb2/dlb2.c        | 206 +-----------------------
> > >  drivers/event/dlb2/dlb2_avx512.c | 267
> > +++++++++++++++++++++++++++++++
> > >  drivers/event/dlb2/dlb2_priv.h   |   8 +
> > >  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
> > >  drivers/event/dlb2/meson.build   |  14 ++
> > >  5 files changed, 513 insertions(+), 201 deletions(-)
> > >  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
> > >  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> > >
> > <snip>
> > > diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
> > > index f963589fd3..0ad4d31785 100644
> > > --- a/drivers/event/dlb2/meson.build
> > > +++ b/drivers/event/dlb2/meson.build
> > > @@ -19,6 +19,20 @@ sources = files(
> > >          'dlb2_selftest.c',
> > >  )
> > >
> > > +dlb2_avx512_support = false
> > > +
> > > +if dpdk_conf.has('RTE_ARCH_X86_64')
> > > +        dlb2_avx512_support = (
> > > +       cc.get_define('__AVX512VL__', args: machine_args) != ''
> > > +   )
> > > +endif
> > > +
> > > +if dlb2_avx512_support == true
> > > +        sources += files('dlb2_avx512.c')
> > > +else
> > > +        sources += files('dlb2_sve.c')
> > > +endif
> > > +
> > >  headers = files('rte_pmd_dlb2.h')
> > >
> > >  deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
> >
> > I believe this can be improved upon further, since it still does not allow
> > a generic build to opportunistically use the AVX-512 code path.
> 
> What does this mean - " generic build to opportunistically use the AVX-512 code path"
> 
> It also
> > makes the runtime check largely pointless as the whole build will have been
> > done with global AVX-512 support, meaning that the binary likely will fail
> > to run if AVX-512 is not available.
> 
> If built for avx512, then that build supports using either avx512, or not.
> 

No, if build for AVX-512, then the compiler can use AVX-512 instructions
anywhere in the binary, so that build can only run on AVX-512 supporting
systems.

> >
> > Instead, I'd recommend doing as other places in DPDK - such as in ACL
> > library, or i40e or ice net drivers - where we not only check the current
> > build support, but also check the compiler support. That way, even if we
> > are building for e.g. a target of AVX2, we can still build the AVX-512
> > parts using the appropriate compiler flags, and choose them
> > opportunistically at runtime.
> 
> I do not understand what you are getting at here.
> 
Check out net/i40e/meson.build and hopefully things may become clearer.

/Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-05-23 16:55       ` Bruce Richardson
@ 2022-06-09 17:40         ` Jerin Jacob
  2022-06-09 18:02           ` McDaniel, Timothy
  0 siblings, 1 reply; 20+ messages in thread
From: Jerin Jacob @ 2022-06-09 17:40 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: McDaniel, Timothy, jerinj, dev, Wires, Kent

On Mon, May 23, 2022 at 10:25 PM Bruce Richardson
<bruce.richardson@intel.com> wrote:
>
> On Mon, May 23, 2022 at 05:52:06PM +0100, McDaniel, Timothy wrote:
> >
> >

> > >
> > > Instead, I'd recommend doing as other places in DPDK - such as in ACL
> > > library, or i40e or ice net drivers - where we not only check the current
> > > build support, but also check the compiler support. That way, even if we
> > > are building for e.g. a target of AVX2, we can still build the AVX-512
> > > parts using the appropriate compiler flags, and choose them
> > > opportunistically at runtime.
> >
> > I do not understand what you are getting at here.
> >
> Check out net/i40e/meson.build and hopefully things may become clearer.

I am updating the patchwork status as "Changes requested" for this patch.

@McDaniel, Timothy  Please send the updated version. Also, the
following patch depends on this. I will wait for the merging below
after this patch is merged.

https://patches.dpdk.org/project/dpdk/patch/20220410225602.1524724-1-timothy.mcdaniel@intel.com/
https://patches.dpdk.org/project/dpdk/patch/20220410224755.1524117-1-timothy.mcdaniel@intel.com/


>
> /Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
  2022-06-09 17:40         ` Jerin Jacob
@ 2022-06-09 18:02           ` McDaniel, Timothy
  0 siblings, 0 replies; 20+ messages in thread
From: McDaniel, Timothy @ 2022-06-09 18:02 UTC (permalink / raw)
  To: Jerin Jacob, Richardson, Bruce; +Cc: jerinj, dev, Wires, Kent

I will submit the changes requested by Bruce.

Thanks,
Tim

> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Thursday, June 9, 2022 12:40 PM
> To: Richardson, Bruce <bruce.richardson@intel.com>
> Cc: McDaniel, Timothy <timothy.mcdaniel@intel.com>; jerinj@marvell.com;
> dev@dpdk.org; Wires, Kent <kent.wires@intel.com>
> Subject: Re: [PATCH v4] event/dlb2: add support for single 512B write of 4 QEs
> 
> On Mon, May 23, 2022 at 10:25 PM Bruce Richardson
> <bruce.richardson@intel.com> wrote:
> >
> > On Mon, May 23, 2022 at 05:52:06PM +0100, McDaniel, Timothy wrote:
> > >
> > >
> 
> > > >
> > > > Instead, I'd recommend doing as other places in DPDK - such as in ACL
> > > > library, or i40e or ice net drivers - where we not only check the current
> > > > build support, but also check the compiler support. That way, even if we
> > > > are building for e.g. a target of AVX2, we can still build the AVX-512
> > > > parts using the appropriate compiler flags, and choose them
> > > > opportunistically at runtime.
> > >
> > > I do not understand what you are getting at here.
> > >
> > Check out net/i40e/meson.build and hopefully things may become clearer.
> 
> I am updating the patchwork status as "Changes requested" for this patch.
> 
> @McDaniel, Timothy  Please send the updated version. Also, the
> following patch depends on this. I will wait for the merging below
> after this patch is merged.
> 
> https://patches.dpdk.org/project/dpdk/patch/20220410225602.1524724-1-
> timothy.mcdaniel@intel.com/
> https://patches.dpdk.org/project/dpdk/patch/20220410224755.1524117-1-
> timothy.mcdaniel@intel.com/
> 
> 
> >
> > /Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v6] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
                   ` (2 preceding siblings ...)
  2022-05-23 16:09 ` [PATCH v4] " Timothy McDaniel
@ 2022-06-10 12:43 ` Timothy McDaniel
  2022-06-10 15:41 ` [PATCH v7] " Timothy McDaniel
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 20+ messages in thread
From: Timothy McDaniel @ 2022-06-10 12:43 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, 512b accesses are available, so movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>
===

Changes since V5:
No code changes - just added --in-reply-to and copied Bruce

Changes since V4:
1) Add build-time control for avx512 support to meson.buildi, based
on implementation found in lib/acl/meson.build
2) Add rte_vect_get_max_simd_bitwidth runtime check before using
avx512 instructions

Changes since V3:
1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
for new file name.

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c        | 208 +-----------------------
 drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h   |  10 ++
 drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/meson.build   |  53 ++++++
 5 files changed, 556 insertions(+), 201 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_sve.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 3641ed2942..0b70dc0f51 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1861,6 +1861,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+#ifdef CC_AVX512_SUPPORT
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+#endif
 	return 0;
 }
 
@@ -2457,21 +2464,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2666,192 +2658,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..ce2d006006
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 4a06d649ab..e8d2d0c656 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -377,6 +377,9 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+#ifdef CC_AVX512_SUPPORT
+	bool use_avx512;
+#endif
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -686,6 +689,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/dlb2_sve.c b/drivers/event/dlb2/dlb2_sve.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sve.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..58146e8aef 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,59 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+# compile AVX512 version if:
+# we are building 64-bit binary (checked above) AND binutils
+# can generate proper code
+
+if binutils_ok
+
+    # compile AVX512 version if either:
+    # a. we have AVX512 supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+
+    # check if all required flags already enabled (variant a).
+    dlb2_avx512_flags = ['__AVX512F__', '__AVX512VL__',
+                         '__AVX512CD__', '__AVX512BW__']
+
+    dlb2_avx512_on = true
+    foreach f:dlb2_avx512_flags
+
+        if cc.get_define(f, args: machine_args) == ''
+            dlb2_avx512_on = false
+        endif
+    endforeach
+
+    if dlb2_avx512_on == true
+
+        sources += files('dlb2_avx512.c')
+        cflags += '-DCC_AVX512_SUPPORT'
+
+    elif cc.has_multi_arguments('-mavx512f', '-mavx512vl',
+                                '-mavx512cd', '-mavx512bw')
+
+        cflags += '-DCC_AVX512_SUPPORT'
+        avx512_tmplib = static_library('avx512_tmp',
+                               'dlb2_avx512.c',
+			       dependencies: [static_rte_eal,
+			                      static_rte_eventdev],
+                               c_args: cflags +
+                                       ['-mavx512f', '-mavx512vl',
+                                        '-mavx512cd', '-mavx512bw'])
+        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
+    else
+        sources += files('dlb2_sve.c')
+    endif
+else
+        sources += files('dlb2_sve.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v7] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
                   ` (3 preceding siblings ...)
  2022-06-10 12:43 ` [PATCH v6] " Timothy McDaniel
@ 2022-06-10 15:41 ` Timothy McDaniel
  2022-06-10 16:15   ` Bruce Richardson
  2022-06-10 16:27 ` [PATCH v8] " Timothy McDaniel
  2022-06-13 20:39 ` [PATCH v9] " Timothy McDaniel
  6 siblings, 1 reply; 20+ messages in thread
From: Timothy McDaniel @ 2022-06-10 15:41 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, 512b accesses are available, so movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>
===

Changes since V6:
1) Check for AVX512VL only, removing checks for other
AVX512 flags in meson.build
2) rename dlb2_sve.c to dlb2_sse.c

Changes since V5:
No code changes - just added --in-reply-to and copied Bruce

Changes since V4:
1) Add build-time control for avx512 support to meson.buildi, based
on implementation found in lib/acl/meson.build
2) Add rte_vect_get_max_simd_bitwidth runtime check before using
avx512 instructions

Changes since V3:
1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
for new file name.

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c        | 208 +-----------------------
 drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h   |  10 ++
 drivers/event/dlb2/dlb2_sse.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/meson.build   |  47 ++++++
 6 files changed, 769 insertions(+), 201 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_sse.c
 create mode 100644 drivers/event/dlb2/dlb2_sve.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 3641ed2942..0b70dc0f51 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1861,6 +1861,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+#ifdef CC_AVX512_SUPPORT
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+#endif
 	return 0;
 }
 
@@ -2457,21 +2464,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2666,192 +2658,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..ce2d006006
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 4a06d649ab..e8d2d0c656 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -377,6 +377,9 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+#ifdef CC_AVX512_SUPPORT
+	bool use_avx512;
+#endif
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -686,6 +689,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/dlb2_sse.c b/drivers/event/dlb2/dlb2_sse.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sse.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_sve.c b/drivers/event/dlb2/dlb2_sve.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sve.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..51ea5ec546 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,53 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+# compile AVX512 version if:
+# we are building 64-bit binary (checked above) AND binutils
+# can generate proper code
+
+if binutils_ok
+
+    # compile AVX512 version if either:
+    # a. we have AVX512VL supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+
+    # check if all required flags already enabled (variant a).
+    dlb2_avx512_on = false
+    if cc.get_define(f, args: machine_args) == '__AVX512VL__'
+        dlb2_avx512_on = true
+    endif
+
+    if dlb2_avx512_on == true
+
+        sources += files('dlb2_avx512.c')
+        cflags += '-DCC_AVX512_SUPPORT'
+
+    elif cc.has_multi_arguments('-mavx512f', '-mavx512vl',
+                                '-mavx512cd', '-mavx512bw')
+
+        cflags += '-DCC_AVX512_SUPPORT'
+        avx512_tmplib = static_library('avx512_tmp',
+                               'dlb2_avx512.c',
+			       dependencies: [static_rte_eal,
+			                      static_rte_eventdev],
+                               c_args: cflags +
+                                       ['-mavx512f', '-mavx512vl',
+                                        '-mavx512cd', '-mavx512bw'])
+        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
+    else
+        sources += files('dlb2_sse.c')
+    endif
+else
+        sources += files('dlb2_sse.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v7] event/dlb2: add support for single 512B write of 4 QEs
  2022-06-10 15:41 ` [PATCH v7] " Timothy McDaniel
@ 2022-06-10 16:15   ` Bruce Richardson
  0 siblings, 0 replies; 20+ messages in thread
From: Bruce Richardson @ 2022-06-10 16:15 UTC (permalink / raw)
  To: Timothy McDaniel; +Cc: jerinj, dev, Kent Wires

On Fri, Jun 10, 2022 at 10:41:25AM -0500, Timothy McDaniel wrote:
> On Xeon, 512b accesses are available, so movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
> 
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> Acked-by: Kent Wires <kent.wires@intel.com>
> ===
> 
> Changes since V6:
> 1) Check for AVX512VL only, removing checks for other
> AVX512 flags in meson.build
> 2) rename dlb2_sve.c to dlb2_sse.c
> 
> Changes since V5:
> No code changes - just added --in-reply-to and copied Bruce
> 
> Changes since V4:
> 1) Add build-time control for avx512 support to meson.buildi, based
> on implementation found in lib/acl/meson.build
> 2) Add rte_vect_get_max_simd_bitwidth runtime check before using
> avx512 instructions
> 
> Changes since V3:
> 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> for new file name.
> 
> Changes since V1:
> 1) Split out dlb2_event_build_hcws into two implementations, one
> that uses AVX512 instructions, and one that does not. Each implementation
> is in its own source file in order to avoid build errors if the compiler
> does not support the newer AVX512 instructions.
> 2) Update meson.build to and pull in appropriate source file based on
> whether the compiler supports AVX512VL
> 3) Check if target supports AVX512VL, and use appropriate implementation
> based on this runtime check.
> ---
>  drivers/event/dlb2/dlb2.c        | 208 +-----------------------
>  drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
>  drivers/event/dlb2/dlb2_priv.h   |  10 ++
>  drivers/event/dlb2/dlb2_sse.c    | 219 +++++++++++++++++++++++++
>  drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
>  drivers/event/dlb2/meson.build   |  47 ++++++
>  6 files changed, 769 insertions(+), 201 deletions(-)
>  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
>  create mode 100644 drivers/event/dlb2/dlb2_sse.c
>  create mode 100644 drivers/event/dlb2/dlb2_sve.c
> 
<snip>
> diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
> index f963589fd3..51ea5ec546 100644
> --- a/drivers/event/dlb2/meson.build
> +++ b/drivers/event/dlb2/meson.build
> @@ -19,6 +19,53 @@ sources = files(
>          'dlb2_selftest.c',
>  )
>  
> +# compile AVX512 version if:
> +# we are building 64-bit binary (checked above) AND binutils
> +# can generate proper code
> +
> +if binutils_ok
> +
> +    # compile AVX512 version if either:
> +    # a. we have AVX512VL supported in minimum instruction set
> +    #    baseline
> +    # b. it's not minimum instruction set, but supported by
> +    #    compiler
> +    #
> +    # in former case, just add avx512 C file to files list
> +    # in latter case, compile c file to static lib, using correct
> +    # compiler flags, and then have the .o file from static lib
> +    # linked into main lib.
> +
> +    # check if all required flags already enabled (variant a).
> +    dlb2_avx512_on = false
> +    if cc.get_define(f, args: machine_args) == '__AVX512VL__'
> +        dlb2_avx512_on = true
> +    endif
> +
> +    if dlb2_avx512_on == true
> +
> +        sources += files('dlb2_avx512.c')
> +        cflags += '-DCC_AVX512_SUPPORT'
> +
> +    elif cc.has_multi_arguments('-mavx512f', '-mavx512vl',
> +                                '-mavx512cd', '-mavx512bw')
> +
> +        cflags += '-DCC_AVX512_SUPPORT'
> +        avx512_tmplib = static_library('avx512_tmp',
> +                               'dlb2_avx512.c',
> +			       dependencies: [static_rte_eal,
> +			                      static_rte_eventdev],

Minor nit - incorrect whitespace

> +                               c_args: cflags +
> +                                       ['-mavx512f', '-mavx512vl',
> +                                        '-mavx512cd', '-mavx512bw'])
> +        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
> +    else
> +        sources += files('dlb2_sse.c')
> +    endif
> +else
> +        sources += files('dlb2_sse.c')
> +endif
> +
>  headers = files('rte_pmd_dlb2.h')
>  
>  deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']

These meson.build file changes look ok to me now.

/Bruce

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v8] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
                   ` (4 preceding siblings ...)
  2022-06-10 15:41 ` [PATCH v7] " Timothy McDaniel
@ 2022-06-10 16:27 ` Timothy McDaniel
  2022-06-13  6:30   ` Jerin Jacob
  2022-06-13 20:39 ` [PATCH v9] " Timothy McDaniel
  6 siblings, 1 reply; 20+ messages in thread
From: Timothy McDaniel @ 2022-06-10 16:27 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, 512b accesses are available, so movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>
===

Changes since V7:
Fixed whitespace issue in meson.build

Changes since V6:
1) Check for AVX512VL only, removing checks for other
AVX512 flags in meson.build
2) rename dlb2_sve.c to dlb2_sse.c

Changes since V5:
No code changes - just added --in-reply-to and copied Bruce

Changes since V4:
1) Add build-time control for avx512 support to meson.buildi, based
on implementation found in lib/acl/meson.build
2) Add rte_vect_get_max_simd_bitwidth runtime check before using
avx512 instructions

Changes since V3:
1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
for new file name.

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c        | 208 +-----------------------
 drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h   |  10 ++
 drivers/event/dlb2/dlb2_sse.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_sve.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/meson.build   |  47 ++++++
 6 files changed, 769 insertions(+), 201 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_sse.c
 create mode 100644 drivers/event/dlb2/dlb2_sve.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 3641ed2942..0b70dc0f51 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1861,6 +1861,13 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+#ifdef CC_AVX512_SUPPORT
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+#endif
 	return 0;
 }
 
@@ -2457,21 +2464,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2666,192 +2658,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..ce2d006006
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 4a06d649ab..e8d2d0c656 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -377,6 +377,9 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+#ifdef CC_AVX512_SUPPORT
+	bool use_avx512;
+#endif
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -686,6 +689,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/dlb2_sse.c b/drivers/event/dlb2/dlb2_sse.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sse.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_sve.c b/drivers/event/dlb2/dlb2_sve.c
new file mode 100644
index 0000000000..82f6588e2a
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sve.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..bc5ab15e78 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,53 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+# compile AVX512 version if:
+# we are building 64-bit binary (checked above) AND binutils
+# can generate proper code
+
+if binutils_ok
+
+    # compile AVX512 version if either:
+    # a. we have AVX512VL supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+
+    # check if all required flags already enabled (variant a).
+    dlb2_avx512_on = false
+    if cc.get_define(f, args: machine_args) == '__AVX512VL__'
+        dlb2_avx512_on = true
+    endif
+
+    if dlb2_avx512_on == true
+
+        sources += files('dlb2_avx512.c')
+        cflags += '-DCC_AVX512_SUPPORT'
+
+    elif cc.has_multi_arguments('-mavx512f', '-mavx512vl',
+                                '-mavx512cd', '-mavx512bw')
+
+        cflags += '-DCC_AVX512_SUPPORT'
+        avx512_tmplib = static_library('avx512_tmp',
+                               'dlb2_avx512.c',
+                               dependencies: [static_rte_eal,
+			                      static_rte_eventdev],
+                               c_args: cflags +
+                                       ['-mavx512f', '-mavx512vl',
+                                        '-mavx512cd', '-mavx512bw'])
+        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
+    else
+        sources += files('dlb2_sse.c')
+    endif
+else
+        sources += files('dlb2_sse.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v8] event/dlb2: add support for single 512B write of 4 QEs
  2022-06-10 16:27 ` [PATCH v8] " Timothy McDaniel
@ 2022-06-13  6:30   ` Jerin Jacob
  0 siblings, 0 replies; 20+ messages in thread
From: Jerin Jacob @ 2022-06-13  6:30 UTC (permalink / raw)
  To: Timothy McDaniel; +Cc: Jerin Jacob, Richardson, Bruce, dpdk-dev, Kent Wires

On Fri, Jun 10, 2022 at 9:58 PM Timothy McDaniel
<timothy.mcdaniel@intel.com> wrote:
>
> On Xeon, 512b accesses are available, so movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
>
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> Acked-by: Kent Wires <kent.wires@intel.com>


@McDaniel, Timothy
Some cosmetic comments are below. Good to merge the next version.
@Richardson, Bruce Hope you are OK with this version patch.



> +#ifdef CC_AVX512_SUPPORT

I think,  We can avoid putting it under CC_AVX512_SUPPORT to avoid clutter.

> +       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
> +           rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
> +               ev_port->qm_port.use_avx512 = true;
> +       else
> +               ev_port->qm_port.use_avx512 = false;
> +#endif
>         return 0;
>  }
>
> @@ -2457,21 +2464,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
>         return 0;
>  }
>
> diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
> new file mode 100644
> index 0000000000..ce2d006006
> --- /dev/null
> +++ b/drivers/event/dlb2/dlb2_avx512.c
> @@ -0,0 +1,267 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2020 Intel Corporation

Fix copyright year.

> + */
> +
> diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
> index 4a06d649ab..e8d2d0c656 100644
> --- a/drivers/event/dlb2/dlb2_priv.h
> +++ b/drivers/event/dlb2/dlb2_priv.h
> @@ -377,6 +377,9 @@ struct dlb2_port {
>         struct dlb2_eventdev_port *ev_port; /* back ptr */
>         bool use_scalar; /* force usage of scalar code */
>         uint16_t hw_credit_quanta;

> +#ifdef CC_AVX512_SUPPORT

Not really need to be under compile time to avoid the compile-time clutter.

> +       bool use_avx512;
> +#endif
>  };
>
> diff --git a/drivers/event/dlb2/dlb2_sse.c b/drivers/event/dlb2/dlb2_sse.c
> new file mode 100644
> index 0000000000..82f6588e2a
> --- /dev/null
> +++ b/drivers/event/dlb2/dlb2_sse.c
> @@ -0,0 +1,219 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2020 Intel Corporation

Fix copyright year.

> diff --git a/drivers/event/dlb2/dlb2_sve.c b/drivers/event/dlb2/dlb2_sve.c
> new file mode 100644
> index 0000000000..82f6588e2a
> --- /dev/null
> +++ b/drivers/event/dlb2/dlb2_sve.c
> @@ -0,0 +1,219 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2020 Intel Corporation

Fix copyright year.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v9] event/dlb2: add support for single 512B write of 4 QEs
  2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
                   ` (5 preceding siblings ...)
  2022-06-10 16:27 ` [PATCH v8] " Timothy McDaniel
@ 2022-06-13 20:39 ` Timothy McDaniel
  2022-06-14 10:40   ` Jerin Jacob
  6 siblings, 1 reply; 20+ messages in thread
From: Timothy McDaniel @ 2022-06-13 20:39 UTC (permalink / raw)
  To: jerinj; +Cc: bruce.richardson, dev, Kent Wires

On Xeon, 512b accesses are available, so movdir64 instruction is able to
perform 512b read and write to DLB producer port. In order for movdir64
to be able to pull its data from store buffers (store-buffer-forwarding)
(before actual write), data should be in single 512b write format.
This commit add change when code is built for Xeon with 512b AVX support
to make single 512b write of all 4 QEs instead of 4x64b writes.

Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
Acked-by: Kent Wires <kent.wires@intel.com>

===

Changes since V8:
1) Removed compile time control of AVX512 enablement
2) Fixed copyright year in all updated and new source files
3) Further refinement of meson.build - only add avx512vl flag to cflags,
not others

Changes since V7:
Fixed whitespace issue in meson.build

Changes since V6:
1) Check for AVX512VL only, removing checks for other
AVX512 flags in meson.build
2) rename dlb2_sve.c to dlb2_sse.c

Changes since V5:
No code changes - just added --in-reply-to and copied Bruce

Changes since V4:
1) Add build-time control for avx512 support to meson.buildi, based
on implementation found in lib/acl/meson.build
2) Add rte_vect_get_max_simd_bitwidth runtime check before using
avx512 instructions

Changes since V3:
1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
for new file name.

Changes since V1:
1) Split out dlb2_event_build_hcws into two implementations, one
that uses AVX512 instructions, and one that does not. Each implementation
is in its own source file in order to avoid build errors if the compiler
does not support the newer AVX512 instructions.
2) Update meson.build to and pull in appropriate source file based on
whether the compiler supports AVX512VL
3) Check if target supports AVX512VL, and use appropriate implementation
based on this runtime check.
---
 drivers/event/dlb2/dlb2.c        | 209 +-----------------------
 drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
 drivers/event/dlb2/dlb2_priv.h   |  10 +-
 drivers/event/dlb2/dlb2_sse.c    | 219 +++++++++++++++++++++++++
 drivers/event/dlb2/meson.build   |  44 +++++
 5 files changed, 546 insertions(+), 203 deletions(-)
 create mode 100644 drivers/event/dlb2/dlb2_avx512.c
 create mode 100644 drivers/event/dlb2/dlb2_sse.c

diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index 3641ed2942..cf74a4a9f6 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016-2020 Intel Corporation
+ * Copyright(c) 2016-2022 Intel Corporation
  */
 
 #include <assert.h>
@@ -1861,6 +1861,12 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
 
 	dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
 
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+		ev_port->qm_port.use_avx512 = true;
+	else
+		ev_port->qm_port.use_avx512 = false;
+
 	return 0;
 }
 
@@ -2457,21 +2463,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
 	return 0;
 }
 
-static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
-	{
-		/* Load-balanced cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
-	},
-	{
-		/* Directed cmd bytes */
-		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
-		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
-	},
-};
-
 static inline uint32_t
 dlb2_port_credits_get(struct dlb2_port *qm_port,
 		      enum dlb2_hw_queue_types type)
@@ -2666,192 +2657,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
 	qm_port->owed_tokens = 0;
 }
 
-static inline void
-dlb2_event_build_hcws(struct dlb2_port *qm_port,
-		      const struct rte_event ev[],
-		      int num,
-		      uint8_t *sched_type,
-		      uint8_t *queue_id)
-{
-	struct dlb2_enqueue_qe *qe;
-	uint16_t sched_word[4];
-	__m128i sse_qe[2];
-	int i;
-
-	qe = qm_port->qe4;
-
-	sse_qe[0] = _mm_setzero_si128();
-	sse_qe[1] = _mm_setzero_si128();
-
-	switch (num) {
-	case 4:
-		/* Construct the metadata portion of two HCWs in one 128b SSE
-		 * register. HCW metadata is constructed in the SSE registers
-		 * like so:
-		 * sse_qe[0][63:0]:   qe[0]'s metadata
-		 * sse_qe[0][127:64]: qe[1]'s metadata
-		 * sse_qe[1][63:0]:   qe[2]'s metadata
-		 * sse_qe[1][127:64]: qe[3]'s metadata
-		 */
-
-		/* Convert the event operation into a command byte and store it
-		 * in the metadata:
-		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
-		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
-		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
-		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
-		 */
-#define DLB2_QE_CMD_BYTE 7
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[0].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
-				cmd_byte_map[qm_port->is_directed][ev[1].op],
-				DLB2_QE_CMD_BYTE + 8);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[2].op],
-				DLB2_QE_CMD_BYTE);
-		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
-				cmd_byte_map[qm_port->is_directed][ev[3].op],
-				DLB2_QE_CMD_BYTE + 8);
-
-		/* Store priority, scheduling type, and queue ID in the sched
-		 * word array because these values are re-used when the
-		 * destination is a directed queue.
-		 */
-		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
-				sched_type[0] << 8 |
-				queue_id[0];
-		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
-				sched_type[1] << 8 |
-				queue_id[1];
-		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
-				sched_type[2] << 8 |
-				queue_id[2];
-		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
-				sched_type[3] << 8 |
-				queue_id[3];
-
-		/* Store the event priority, scheduling type, and queue ID in
-		 * the metadata:
-		 * sse_qe[0][31:16] = sched_word[0]
-		 * sse_qe[0][95:80] = sched_word[1]
-		 * sse_qe[1][31:16] = sched_word[2]
-		 * sse_qe[1][95:80] = sched_word[3]
-		 */
-#define DLB2_QE_QID_SCHED_WORD 1
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[0],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     sched_word[1],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[2],
-					     DLB2_QE_QID_SCHED_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     sched_word[3],
-					     DLB2_QE_QID_SCHED_WORD + 4);
-
-		/* If the destination is a load-balanced queue, store the lock
-		 * ID. If it is a directed queue, DLB places this field in
-		 * bytes 10-11 of the received QE, so we format it accordingly:
-		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
-		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
-		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
-		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
-		 */
-#define DLB2_QE_LOCK_ID_WORD 2
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
-					sched_word[0] : ev[0].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
-					sched_word[1] : ev[1].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
-					sched_word[2] : ev[2].flow_id,
-				DLB2_QE_LOCK_ID_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
-					sched_word[3] : ev[3].flow_id,
-				DLB2_QE_LOCK_ID_WORD + 4);
-
-		/* Store the event type and sub event type in the metadata:
-		 * sse_qe[0][15:0]  = flow_id[0]
-		 * sse_qe[0][79:64] = flow_id[1]
-		 * sse_qe[1][15:0]  = flow_id[2]
-		 * sse_qe[1][79:64] = flow_id[3]
-		 */
-#define DLB2_QE_EV_TYPE_WORD 0
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[0].sub_event_type << 8 |
-						ev[0].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
-					     ev[1].sub_event_type << 8 |
-						ev[1].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[2].sub_event_type << 8 |
-						ev[2].event_type,
-					     DLB2_QE_EV_TYPE_WORD);
-		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
-					     ev[3].sub_event_type << 8 |
-						ev[3].event_type,
-					     DLB2_QE_EV_TYPE_WORD + 4);
-
-		/* Store the metadata to memory (use the double-precision
-		 * _mm_storeh_pd because there is no integer function for
-		 * storing the upper 64b):
-		 * qe[0] metadata = sse_qe[0][63:0]
-		 * qe[1] metadata = sse_qe[0][127:64]
-		 * qe[2] metadata = sse_qe[1][63:0]
-		 * qe[3] metadata = sse_qe[1][127:64]
-		 */
-		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
-		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
-			      (__m128d)sse_qe[0]);
-		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
-		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
-			      (__m128d)sse_qe[1]);
-
-		qe[0].data = ev[0].u64;
-		qe[1].data = ev[1].u64;
-		qe[2].data = ev[2].u64;
-		qe[3].data = ev[3].u64;
-
-		break;
-	case 3:
-	case 2:
-	case 1:
-		for (i = 0; i < num; i++) {
-			qe[i].cmd_byte =
-				cmd_byte_map[qm_port->is_directed][ev[i].op];
-			qe[i].sched_type = sched_type[i];
-			qe[i].data = ev[i].u64;
-			qe[i].qid = queue_id[i];
-			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
-			qe[i].lock_id = ev[i].flow_id;
-			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
-				struct dlb2_msg_info *info =
-					(struct dlb2_msg_info *)&qe[i].lock_id;
-
-				info->qid = queue_id[i];
-				info->sched_type = DLB2_SCHED_DIRECTED;
-				info->priority = qe[i].priority;
-			}
-			qe[i].u.event_type.major = ev[i].event_type;
-			qe[i].u.event_type.sub = ev[i].sub_event_type;
-		}
-		break;
-	case 0:
-		break;
-	}
-}
-
 static inline int
 dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
 			struct dlb2_port *qm_port,
diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
new file mode 100644
index 0000000000..d4aaa04a01
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_avx512.c
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is used when the compiler on the build machine
+ * supports AVX512VL. We will perform a runtime check before actually
+ * executing those instructions.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		if (qm_port->use_avx512) {
+
+			/*
+			 * 1) Build avx512 QE store and build each
+			 *    QE individually as XMM register
+			 * 2) Merge the 4 XMM registers/QEs into single AVX512
+			 *    register
+			 * 3) Store single avx512 register to &qe[0] (4x QEs
+			 *    stored in 1x store)
+			 */
+
+			__m128i v_qe0 = _mm_setzero_si128();
+			uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
+			v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
+
+			__m128i v_qe1 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[0], 1);
+			v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
+			v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
+
+			__m128i v_qe2 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
+			v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
+
+			__m128i v_qe3 = _mm_setzero_si128();
+			meta = _mm_extract_epi64(sse_qe[1], 1);
+			v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
+			v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
+
+			/* we have 4x XMM registers, one per QE. */
+			__m512i v_all_qes = _mm512_setzero_si512();
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
+			v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
+
+			/*
+			 * store the 4x QEs in a single register to the scratch
+			 * space of the PMD
+			 */
+			_mm512_store_si512(&qe[0], v_all_qes);
+
+		} else {
+
+			/*
+			 * Store the metadata to memory (use the double-precision
+			 * _mm_storeh_pd because there is no integer function for
+			 * storing the upper 64b):
+			 * qe[0] metadata = sse_qe[0][63:0]
+			 * qe[1] metadata = sse_qe[0][127:64]
+			 * qe[2] metadata = sse_qe[1][63:0]
+			 * qe[3] metadata = sse_qe[1][127:64]
+			 */
+			_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+					 sse_qe[0]);
+			_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+				      (__m128d)sse_qe[0]);
+			_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+					 sse_qe[1]);
+			_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+			qe[0].data = ev[0].u64;
+			qe[1].data = ev[1].u64;
+			qe[2].data = ev[2].u64;
+			qe[3].data = ev[3].u64;
+		}
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
index 4a06d649ab..df69d57b83 100644
--- a/drivers/event/dlb2/dlb2_priv.h
+++ b/drivers/event/dlb2/dlb2_priv.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016-2020 Intel Corporation
+ * Copyright(c) 2016-2022 Intel Corporation
  */
 
 #ifndef _DLB2_PRIV_H_
@@ -377,6 +377,7 @@ struct dlb2_port {
 	struct dlb2_eventdev_port *ev_port; /* back ptr */
 	bool use_scalar; /* force usage of scalar code */
 	uint16_t hw_credit_quanta;
+	bool use_avx512;
 };
 
 /* Per-process per-port mmio and memory pointers */
@@ -686,6 +687,13 @@ int dlb2_parse_params(const char *params,
 		      struct dlb2_devargs *dlb2_args,
 		      uint8_t version);
 
+void dlb2_event_build_hcws(struct dlb2_port *qm_port,
+			   const struct rte_event ev[],
+			   int num,
+			   uint8_t *sched_type,
+			   uint8_t *queue_id);
+
+
 /* Extern globals */
 extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
 
diff --git a/drivers/event/dlb2/dlb2_sse.c b/drivers/event/dlb2/dlb2_sse.c
new file mode 100644
index 0000000000..8fc12d47f7
--- /dev/null
+++ b/drivers/event/dlb2/dlb2_sse.c
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "dlb2_priv.h"
+#include "dlb2_iface.h"
+#include "dlb2_inline_fns.h"
+
+/*
+ * This source file is only used when the compiler on the build machine
+ * does not support AVX512VL.
+ */
+
+static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
+	{
+		/* Load-balanced cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
+	},
+	{
+		/* Directed cmd bytes */
+		[RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
+		[RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
+	},
+};
+
+void
+dlb2_event_build_hcws(struct dlb2_port *qm_port,
+		      const struct rte_event ev[],
+		      int num,
+		      uint8_t *sched_type,
+		      uint8_t *queue_id)
+{
+	struct dlb2_enqueue_qe *qe;
+	uint16_t sched_word[4];
+	__m128i sse_qe[2];
+	int i;
+
+	qe = qm_port->qe4;
+
+	sse_qe[0] = _mm_setzero_si128();
+	sse_qe[1] = _mm_setzero_si128();
+
+	switch (num) {
+	case 4:
+		/* Construct the metadata portion of two HCWs in one 128b SSE
+		 * register. HCW metadata is constructed in the SSE registers
+		 * like so:
+		 * sse_qe[0][63:0]:   qe[0]'s metadata
+		 * sse_qe[0][127:64]: qe[1]'s metadata
+		 * sse_qe[1][63:0]:   qe[2]'s metadata
+		 * sse_qe[1][127:64]: qe[3]'s metadata
+		 */
+
+		/* Convert the event operation into a command byte and store it
+		 * in the metadata:
+		 * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
+		 * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
+		 * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
+		 * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
+		 */
+#define DLB2_QE_CMD_BYTE 7
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[0].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[0] = _mm_insert_epi8(sse_qe[0],
+				cmd_byte_map[qm_port->is_directed][ev[1].op],
+				DLB2_QE_CMD_BYTE + 8);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[2].op],
+				DLB2_QE_CMD_BYTE);
+		sse_qe[1] = _mm_insert_epi8(sse_qe[1],
+				cmd_byte_map[qm_port->is_directed][ev[3].op],
+				DLB2_QE_CMD_BYTE + 8);
+
+		/* Store priority, scheduling type, and queue ID in the sched
+		 * word array because these values are re-used when the
+		 * destination is a directed queue.
+		 */
+		sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
+				sched_type[0] << 8 |
+				queue_id[0];
+		sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
+				sched_type[1] << 8 |
+				queue_id[1];
+		sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
+				sched_type[2] << 8 |
+				queue_id[2];
+		sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
+				sched_type[3] << 8 |
+				queue_id[3];
+
+		/* Store the event priority, scheduling type, and queue ID in
+		 * the metadata:
+		 * sse_qe[0][31:16] = sched_word[0]
+		 * sse_qe[0][95:80] = sched_word[1]
+		 * sse_qe[1][31:16] = sched_word[2]
+		 * sse_qe[1][95:80] = sched_word[3]
+		 */
+#define DLB2_QE_QID_SCHED_WORD 1
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[0],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     sched_word[1],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[2],
+					     DLB2_QE_QID_SCHED_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     sched_word[3],
+					     DLB2_QE_QID_SCHED_WORD + 4);
+
+		/* If the destination is a load-balanced queue, store the lock
+		 * ID. If it is a directed queue, DLB places this field in
+		 * bytes 10-11 of the received QE, so we format it accordingly:
+		 * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
+		 * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
+		 * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
+		 * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
+		 */
+#define DLB2_QE_LOCK_ID_WORD 2
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[0] == DLB2_SCHED_DIRECTED) ?
+					sched_word[0] : ev[0].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+				(sched_type[1] == DLB2_SCHED_DIRECTED) ?
+					sched_word[1] : ev[1].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[2] == DLB2_SCHED_DIRECTED) ?
+					sched_word[2] : ev[2].flow_id,
+				DLB2_QE_LOCK_ID_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+				(sched_type[3] == DLB2_SCHED_DIRECTED) ?
+					sched_word[3] : ev[3].flow_id,
+				DLB2_QE_LOCK_ID_WORD + 4);
+
+		/* Store the event type and sub event type in the metadata:
+		 * sse_qe[0][15:0]  = flow_id[0]
+		 * sse_qe[0][79:64] = flow_id[1]
+		 * sse_qe[1][15:0]  = flow_id[2]
+		 * sse_qe[1][79:64] = flow_id[3]
+		 */
+#define DLB2_QE_EV_TYPE_WORD 0
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[0].sub_event_type << 8 |
+						ev[0].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[0] = _mm_insert_epi16(sse_qe[0],
+					     ev[1].sub_event_type << 8 |
+						ev[1].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[2].sub_event_type << 8 |
+						ev[2].event_type,
+					     DLB2_QE_EV_TYPE_WORD);
+		sse_qe[1] = _mm_insert_epi16(sse_qe[1],
+					     ev[3].sub_event_type << 8 |
+						ev[3].event_type,
+					     DLB2_QE_EV_TYPE_WORD + 4);
+
+		/*
+		 * Store the metadata to memory (use the double-precision
+		 * _mm_storeh_pd because there is no integer function for
+		 * storing the upper 64b):
+		 * qe[0] metadata = sse_qe[0][63:0]
+		 * qe[1] metadata = sse_qe[0][127:64]
+		 * qe[2] metadata = sse_qe[1][63:0]
+		 * qe[3] metadata = sse_qe[1][127:64]
+		 */
+		_mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
+				 sse_qe[0]);
+		_mm_storeh_pd((double *)&qe[1].u.opaque_data,
+			      (__m128d)sse_qe[0]);
+		_mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
+				 sse_qe[1]);
+		_mm_storeh_pd((double *)&qe[3].u.opaque_data,
+				      (__m128d)sse_qe[1]);
+
+		qe[0].data = ev[0].u64;
+		qe[1].data = ev[1].u64;
+		qe[2].data = ev[2].u64;
+		qe[3].data = ev[3].u64;
+
+		break;
+	case 3:
+	case 2:
+	case 1:
+		for (i = 0; i < num; i++) {
+			qe[i].cmd_byte =
+				cmd_byte_map[qm_port->is_directed][ev[i].op];
+			qe[i].sched_type = sched_type[i];
+			qe[i].data = ev[i].u64;
+			qe[i].qid = queue_id[i];
+			qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
+			qe[i].lock_id = ev[i].flow_id;
+			if (sched_type[i] == DLB2_SCHED_DIRECTED) {
+				struct dlb2_msg_info *info =
+					(struct dlb2_msg_info *)&qe[i].lock_id;
+
+				info->qid = queue_id[i];
+				info->sched_type = DLB2_SCHED_DIRECTED;
+				info->priority = qe[i].priority;
+			}
+			qe[i].u.event_type.major = ev[i].event_type;
+			qe[i].u.event_type.sub = ev[i].sub_event_type;
+		}
+		break;
+	case 0:
+		break;
+	}
+}
diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
index f963589fd3..c08f480570 100644
--- a/drivers/event/dlb2/meson.build
+++ b/drivers/event/dlb2/meson.build
@@ -19,6 +19,50 @@ sources = files(
         'dlb2_selftest.c',
 )
 
+# compile AVX512 version if:
+# we are building 64-bit binary (checked above) AND binutils
+# can generate proper code
+
+if binutils_ok
+
+    # compile AVX512 version if either:
+    # a. we have AVX512VL supported in minimum instruction set
+    #    baseline
+    # b. it's not minimum instruction set, but supported by
+    #    compiler
+    #
+    # in former case, just add avx512 C file to files list
+    # in latter case, compile c file to static lib, using correct
+    # compiler flags, and then have the .o file from static lib
+    # linked into main lib.
+
+    # check if all required flags already enabled (variant a).
+    dlb2_avx512_on = false
+    if cc.get_define(f, args: machine_args) == '__AVX512VL__'
+        dlb2_avx512_on = true
+    endif
+
+    if dlb2_avx512_on == true
+
+        sources += files('dlb2_avx512.c')
+        cflags += '-DCC_AVX512_SUPPORT'
+
+    elif cc.has_multi_arguments('-mavx512vl')
+
+        cflags += '-DCC_AVX512_SUPPORT'
+        avx512_tmplib = static_library('avx512_tmp',
+                               'dlb2_avx512.c',
+                               dependencies: [static_rte_eal,
+			                      static_rte_eventdev],
+                               c_args: cflags + ['-mavx512vl'])
+        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
+    else
+        sources += files('dlb2_sse.c')
+    endif
+else
+        sources += files('dlb2_sse.c')
+endif
+
 headers = files('rte_pmd_dlb2.h')
 
 deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
-- 
2.25.1


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v9] event/dlb2: add support for single 512B write of 4 QEs
  2022-06-13 20:39 ` [PATCH v9] " Timothy McDaniel
@ 2022-06-14 10:40   ` Jerin Jacob
  0 siblings, 0 replies; 20+ messages in thread
From: Jerin Jacob @ 2022-06-14 10:40 UTC (permalink / raw)
  To: Timothy McDaniel; +Cc: Jerin Jacob, Richardson, Bruce, dpdk-dev, Kent Wires

On Tue, Jun 14, 2022 at 2:09 AM Timothy McDaniel
<timothy.mcdaniel@intel.com> wrote:
>
> On Xeon, 512b accesses are available, so movdir64 instruction is able to
> perform 512b read and write to DLB producer port. In order for movdir64
> to be able to pull its data from store buffers (store-buffer-forwarding)
> (before actual write), data should be in single 512b write format.
> This commit add change when code is built for Xeon with 512b AVX support
> to make single 512b write of all 4 QEs instead of 4x64b writes.
>
> Signed-off-by: Timothy McDaniel <timothy.mcdaniel@intel.com>
> Acked-by: Kent Wires <kent.wires@intel.com>

Applied to dpdk-next-net-eventdev/for-main. Thanks


>
> ===
>
> Changes since V8:
> 1) Removed compile time control of AVX512 enablement
> 2) Fixed copyright year in all updated and new source files
> 3) Further refinement of meson.build - only add avx512vl flag to cflags,
> not others
>
> Changes since V7:
> Fixed whitespace issue in meson.build
>
> Changes since V6:
> 1) Check for AVX512VL only, removing checks for other
> AVX512 flags in meson.build
> 2) rename dlb2_sve.c to dlb2_sse.c
>
> Changes since V5:
> No code changes - just added --in-reply-to and copied Bruce
>
> Changes since V4:
> 1) Add build-time control for avx512 support to meson.buildi, based
> on implementation found in lib/acl/meson.build
> 2) Add rte_vect_get_max_simd_bitwidth runtime check before using
> avx512 instructions
>
> Changes since V3:
> 1) Renamed dlb2_noavx512.c to dlb2_sve.c, and fixed up meson.build
> for new file name.
>
> Changes since V1:
> 1) Split out dlb2_event_build_hcws into two implementations, one
> that uses AVX512 instructions, and one that does not. Each implementation
> is in its own source file in order to avoid build errors if the compiler
> does not support the newer AVX512 instructions.
> 2) Update meson.build to and pull in appropriate source file based on
> whether the compiler supports AVX512VL
> 3) Check if target supports AVX512VL, and use appropriate implementation
> based on this runtime check.
> ---
>  drivers/event/dlb2/dlb2.c        | 209 +-----------------------
>  drivers/event/dlb2/dlb2_avx512.c | 267 +++++++++++++++++++++++++++++++
>  drivers/event/dlb2/dlb2_priv.h   |  10 +-
>  drivers/event/dlb2/dlb2_sse.c    | 219 +++++++++++++++++++++++++
>  drivers/event/dlb2/meson.build   |  44 +++++
>  5 files changed, 546 insertions(+), 203 deletions(-)
>  create mode 100644 drivers/event/dlb2/dlb2_avx512.c
>  create mode 100644 drivers/event/dlb2/dlb2_sse.c
>
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index 3641ed2942..cf74a4a9f6 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -1,5 +1,5 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2016-2020 Intel Corporation
> + * Copyright(c) 2016-2022 Intel Corporation
>   */
>
>  #include <assert.h>
> @@ -1861,6 +1861,12 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
>
>         dev->data->ports[ev_port_id] = &dlb2->ev_ports[ev_port_id];
>
> +       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
> +           rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
> +               ev_port->qm_port.use_avx512 = true;
> +       else
> +               ev_port->qm_port.use_avx512 = false;
> +
>         return 0;
>  }
>
> @@ -2457,21 +2463,6 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
>         return 0;
>  }
>
> -static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
> -       {
> -               /* Load-balanced cmd bytes */
> -               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> -               [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
> -               [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
> -       },
> -       {
> -               /* Directed cmd bytes */
> -               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> -               [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
> -               [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
> -       },
> -};
> -
>  static inline uint32_t
>  dlb2_port_credits_get(struct dlb2_port *qm_port,
>                       enum dlb2_hw_queue_types type)
> @@ -2666,192 +2657,6 @@ dlb2_construct_token_pop_qe(struct dlb2_port *qm_port, int idx)
>         qm_port->owed_tokens = 0;
>  }
>
> -static inline void
> -dlb2_event_build_hcws(struct dlb2_port *qm_port,
> -                     const struct rte_event ev[],
> -                     int num,
> -                     uint8_t *sched_type,
> -                     uint8_t *queue_id)
> -{
> -       struct dlb2_enqueue_qe *qe;
> -       uint16_t sched_word[4];
> -       __m128i sse_qe[2];
> -       int i;
> -
> -       qe = qm_port->qe4;
> -
> -       sse_qe[0] = _mm_setzero_si128();
> -       sse_qe[1] = _mm_setzero_si128();
> -
> -       switch (num) {
> -       case 4:
> -               /* Construct the metadata portion of two HCWs in one 128b SSE
> -                * register. HCW metadata is constructed in the SSE registers
> -                * like so:
> -                * sse_qe[0][63:0]:   qe[0]'s metadata
> -                * sse_qe[0][127:64]: qe[1]'s metadata
> -                * sse_qe[1][63:0]:   qe[2]'s metadata
> -                * sse_qe[1][127:64]: qe[3]'s metadata
> -                */
> -
> -               /* Convert the event operation into a command byte and store it
> -                * in the metadata:
> -                * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
> -                * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
> -                * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
> -                * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
> -                */
> -#define DLB2_QE_CMD_BYTE 7
> -               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> -                               cmd_byte_map[qm_port->is_directed][ev[0].op],
> -                               DLB2_QE_CMD_BYTE);
> -               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> -                               cmd_byte_map[qm_port->is_directed][ev[1].op],
> -                               DLB2_QE_CMD_BYTE + 8);
> -               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> -                               cmd_byte_map[qm_port->is_directed][ev[2].op],
> -                               DLB2_QE_CMD_BYTE);
> -               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> -                               cmd_byte_map[qm_port->is_directed][ev[3].op],
> -                               DLB2_QE_CMD_BYTE + 8);
> -
> -               /* Store priority, scheduling type, and queue ID in the sched
> -                * word array because these values are re-used when the
> -                * destination is a directed queue.
> -                */
> -               sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
> -                               sched_type[0] << 8 |
> -                               queue_id[0];
> -               sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
> -                               sched_type[1] << 8 |
> -                               queue_id[1];
> -               sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
> -                               sched_type[2] << 8 |
> -                               queue_id[2];
> -               sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
> -                               sched_type[3] << 8 |
> -                               queue_id[3];
> -
> -               /* Store the event priority, scheduling type, and queue ID in
> -                * the metadata:
> -                * sse_qe[0][31:16] = sched_word[0]
> -                * sse_qe[0][95:80] = sched_word[1]
> -                * sse_qe[1][31:16] = sched_word[2]
> -                * sse_qe[1][95:80] = sched_word[3]
> -                */
> -#define DLB2_QE_QID_SCHED_WORD 1
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                                            sched_word[0],
> -                                            DLB2_QE_QID_SCHED_WORD);
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                                            sched_word[1],
> -                                            DLB2_QE_QID_SCHED_WORD + 4);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                                            sched_word[2],
> -                                            DLB2_QE_QID_SCHED_WORD);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                                            sched_word[3],
> -                                            DLB2_QE_QID_SCHED_WORD + 4);
> -
> -               /* If the destination is a load-balanced queue, store the lock
> -                * ID. If it is a directed queue, DLB places this field in
> -                * bytes 10-11 of the received QE, so we format it accordingly:
> -                * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
> -                * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
> -                * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
> -                * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
> -                */
> -#define DLB2_QE_LOCK_ID_WORD 2
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                               (sched_type[0] == DLB2_SCHED_DIRECTED) ?
> -                                       sched_word[0] : ev[0].flow_id,
> -                               DLB2_QE_LOCK_ID_WORD);
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                               (sched_type[1] == DLB2_SCHED_DIRECTED) ?
> -                                       sched_word[1] : ev[1].flow_id,
> -                               DLB2_QE_LOCK_ID_WORD + 4);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                               (sched_type[2] == DLB2_SCHED_DIRECTED) ?
> -                                       sched_word[2] : ev[2].flow_id,
> -                               DLB2_QE_LOCK_ID_WORD);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                               (sched_type[3] == DLB2_SCHED_DIRECTED) ?
> -                                       sched_word[3] : ev[3].flow_id,
> -                               DLB2_QE_LOCK_ID_WORD + 4);
> -
> -               /* Store the event type and sub event type in the metadata:
> -                * sse_qe[0][15:0]  = flow_id[0]
> -                * sse_qe[0][79:64] = flow_id[1]
> -                * sse_qe[1][15:0]  = flow_id[2]
> -                * sse_qe[1][79:64] = flow_id[3]
> -                */
> -#define DLB2_QE_EV_TYPE_WORD 0
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                                            ev[0].sub_event_type << 8 |
> -                                               ev[0].event_type,
> -                                            DLB2_QE_EV_TYPE_WORD);
> -               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> -                                            ev[1].sub_event_type << 8 |
> -                                               ev[1].event_type,
> -                                            DLB2_QE_EV_TYPE_WORD + 4);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                                            ev[2].sub_event_type << 8 |
> -                                               ev[2].event_type,
> -                                            DLB2_QE_EV_TYPE_WORD);
> -               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> -                                            ev[3].sub_event_type << 8 |
> -                                               ev[3].event_type,
> -                                            DLB2_QE_EV_TYPE_WORD + 4);
> -
> -               /* Store the metadata to memory (use the double-precision
> -                * _mm_storeh_pd because there is no integer function for
> -                * storing the upper 64b):
> -                * qe[0] metadata = sse_qe[0][63:0]
> -                * qe[1] metadata = sse_qe[0][127:64]
> -                * qe[2] metadata = sse_qe[1][63:0]
> -                * qe[3] metadata = sse_qe[1][127:64]
> -                */
> -               _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
> -               _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> -                             (__m128d)sse_qe[0]);
> -               _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
> -               _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> -                             (__m128d)sse_qe[1]);
> -
> -               qe[0].data = ev[0].u64;
> -               qe[1].data = ev[1].u64;
> -               qe[2].data = ev[2].u64;
> -               qe[3].data = ev[3].u64;
> -
> -               break;
> -       case 3:
> -       case 2:
> -       case 1:
> -               for (i = 0; i < num; i++) {
> -                       qe[i].cmd_byte =
> -                               cmd_byte_map[qm_port->is_directed][ev[i].op];
> -                       qe[i].sched_type = sched_type[i];
> -                       qe[i].data = ev[i].u64;
> -                       qe[i].qid = queue_id[i];
> -                       qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
> -                       qe[i].lock_id = ev[i].flow_id;
> -                       if (sched_type[i] == DLB2_SCHED_DIRECTED) {
> -                               struct dlb2_msg_info *info =
> -                                       (struct dlb2_msg_info *)&qe[i].lock_id;
> -
> -                               info->qid = queue_id[i];
> -                               info->sched_type = DLB2_SCHED_DIRECTED;
> -                               info->priority = qe[i].priority;
> -                       }
> -                       qe[i].u.event_type.major = ev[i].event_type;
> -                       qe[i].u.event_type.sub = ev[i].sub_event_type;
> -               }
> -               break;
> -       case 0:
> -               break;
> -       }
> -}
> -
>  static inline int
>  dlb2_event_enqueue_prep(struct dlb2_eventdev_port *ev_port,
>                         struct dlb2_port *qm_port,
> diff --git a/drivers/event/dlb2/dlb2_avx512.c b/drivers/event/dlb2/dlb2_avx512.c
> new file mode 100644
> index 0000000000..d4aaa04a01
> --- /dev/null
> +++ b/drivers/event/dlb2/dlb2_avx512.c
> @@ -0,0 +1,267 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <stdbool.h>
> +
> +#include "dlb2_priv.h"
> +#include "dlb2_iface.h"
> +#include "dlb2_inline_fns.h"
> +
> +/*
> + * This source file is used when the compiler on the build machine
> + * supports AVX512VL. We will perform a runtime check before actually
> + * executing those instructions.
> + */
> +
> +static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
> +       {
> +               /* Load-balanced cmd bytes */
> +               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
> +               [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
> +       },
> +       {
> +               /* Directed cmd bytes */
> +               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
> +       },
> +};
> +
> +void
> +dlb2_event_build_hcws(struct dlb2_port *qm_port,
> +                     const struct rte_event ev[],
> +                     int num,
> +                     uint8_t *sched_type,
> +                     uint8_t *queue_id)
> +{
> +       struct dlb2_enqueue_qe *qe;
> +       uint16_t sched_word[4];
> +       __m128i sse_qe[2];
> +       int i;
> +
> +       qe = qm_port->qe4;
> +
> +       sse_qe[0] = _mm_setzero_si128();
> +       sse_qe[1] = _mm_setzero_si128();
> +
> +       switch (num) {
> +       case 4:
> +               /* Construct the metadata portion of two HCWs in one 128b SSE
> +                * register. HCW metadata is constructed in the SSE registers
> +                * like so:
> +                * sse_qe[0][63:0]:   qe[0]'s metadata
> +                * sse_qe[0][127:64]: qe[1]'s metadata
> +                * sse_qe[1][63:0]:   qe[2]'s metadata
> +                * sse_qe[1][127:64]: qe[3]'s metadata
> +                */
> +
> +               /* Convert the event operation into a command byte and store it
> +                * in the metadata:
> +                * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
> +                * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
> +                * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
> +                * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
> +                */
> +#define DLB2_QE_CMD_BYTE 7
> +               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> +                               cmd_byte_map[qm_port->is_directed][ev[0].op],
> +                               DLB2_QE_CMD_BYTE);
> +               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> +                               cmd_byte_map[qm_port->is_directed][ev[1].op],
> +                               DLB2_QE_CMD_BYTE + 8);
> +               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> +                               cmd_byte_map[qm_port->is_directed][ev[2].op],
> +                               DLB2_QE_CMD_BYTE);
> +               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> +                               cmd_byte_map[qm_port->is_directed][ev[3].op],
> +                               DLB2_QE_CMD_BYTE + 8);
> +
> +               /* Store priority, scheduling type, and queue ID in the sched
> +                * word array because these values are re-used when the
> +                * destination is a directed queue.
> +                */
> +               sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
> +                               sched_type[0] << 8 |
> +                               queue_id[0];
> +               sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
> +                               sched_type[1] << 8 |
> +                               queue_id[1];
> +               sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
> +                               sched_type[2] << 8 |
> +                               queue_id[2];
> +               sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
> +                               sched_type[3] << 8 |
> +                               queue_id[3];
> +
> +               /* Store the event priority, scheduling type, and queue ID in
> +                * the metadata:
> +                * sse_qe[0][31:16] = sched_word[0]
> +                * sse_qe[0][95:80] = sched_word[1]
> +                * sse_qe[1][31:16] = sched_word[2]
> +                * sse_qe[1][95:80] = sched_word[3]
> +                */
> +#define DLB2_QE_QID_SCHED_WORD 1
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            sched_word[0],
> +                                            DLB2_QE_QID_SCHED_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            sched_word[1],
> +                                            DLB2_QE_QID_SCHED_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            sched_word[2],
> +                                            DLB2_QE_QID_SCHED_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            sched_word[3],
> +                                            DLB2_QE_QID_SCHED_WORD + 4);
> +
> +               /* If the destination is a load-balanced queue, store the lock
> +                * ID. If it is a directed queue, DLB places this field in
> +                * bytes 10-11 of the received QE, so we format it accordingly:
> +                * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
> +                * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
> +                * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
> +                * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
> +                */
> +#define DLB2_QE_LOCK_ID_WORD 2
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                               (sched_type[0] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[0] : ev[0].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                               (sched_type[1] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[1] : ev[1].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                               (sched_type[2] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[2] : ev[2].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                               (sched_type[3] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[3] : ev[3].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD + 4);
> +
> +               /* Store the event type and sub event type in the metadata:
> +                * sse_qe[0][15:0]  = flow_id[0]
> +                * sse_qe[0][79:64] = flow_id[1]
> +                * sse_qe[1][15:0]  = flow_id[2]
> +                * sse_qe[1][79:64] = flow_id[3]
> +                */
> +#define DLB2_QE_EV_TYPE_WORD 0
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            ev[0].sub_event_type << 8 |
> +                                               ev[0].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            ev[1].sub_event_type << 8 |
> +                                               ev[1].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            ev[2].sub_event_type << 8 |
> +                                               ev[2].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            ev[3].sub_event_type << 8 |
> +                                               ev[3].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD + 4);
> +
> +               if (qm_port->use_avx512) {
> +
> +                       /*
> +                        * 1) Build avx512 QE store and build each
> +                        *    QE individually as XMM register
> +                        * 2) Merge the 4 XMM registers/QEs into single AVX512
> +                        *    register
> +                        * 3) Store single avx512 register to &qe[0] (4x QEs
> +                        *    stored in 1x store)
> +                        */
> +
> +                       __m128i v_qe0 = _mm_setzero_si128();
> +                       uint64_t meta = _mm_extract_epi64(sse_qe[0], 0);
> +                       v_qe0 = _mm_insert_epi64(v_qe0, ev[0].u64, 0);
> +                       v_qe0 = _mm_insert_epi64(v_qe0, meta, 1);
> +
> +                       __m128i v_qe1 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[0], 1);
> +                       v_qe1 = _mm_insert_epi64(v_qe1, ev[1].u64, 0);
> +                       v_qe1 = _mm_insert_epi64(v_qe1, meta, 1);
> +
> +                       __m128i v_qe2 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[1], 0);
> +                       v_qe2 = _mm_insert_epi64(v_qe2, ev[2].u64, 0);
> +                       v_qe2 = _mm_insert_epi64(v_qe2, meta, 1);
> +
> +                       __m128i v_qe3 = _mm_setzero_si128();
> +                       meta = _mm_extract_epi64(sse_qe[1], 1);
> +                       v_qe3 = _mm_insert_epi64(v_qe3, ev[3].u64, 0);
> +                       v_qe3 = _mm_insert_epi64(v_qe3, meta, 1);
> +
> +                       /* we have 4x XMM registers, one per QE. */
> +                       __m512i v_all_qes = _mm512_setzero_si512();
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe0, 0);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe1, 1);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe2, 2);
> +                       v_all_qes = _mm512_inserti32x4(v_all_qes, v_qe3, 3);
> +
> +                       /*
> +                        * store the 4x QEs in a single register to the scratch
> +                        * space of the PMD
> +                        */
> +                       _mm512_store_si512(&qe[0], v_all_qes);
> +
> +               } else {
> +
> +                       /*
> +                        * Store the metadata to memory (use the double-precision
> +                        * _mm_storeh_pd because there is no integer function for
> +                        * storing the upper 64b):
> +                        * qe[0] metadata = sse_qe[0][63:0]
> +                        * qe[1] metadata = sse_qe[0][127:64]
> +                        * qe[2] metadata = sse_qe[1][63:0]
> +                        * qe[3] metadata = sse_qe[1][127:64]
> +                        */
> +                       _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
> +                                        sse_qe[0]);
> +                       _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> +                                     (__m128d)sse_qe[0]);
> +                       _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
> +                                        sse_qe[1]);
> +                       _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> +                                     (__m128d)sse_qe[1]);
> +
> +                       qe[0].data = ev[0].u64;
> +                       qe[1].data = ev[1].u64;
> +                       qe[2].data = ev[2].u64;
> +                       qe[3].data = ev[3].u64;
> +               }
> +
> +               break;
> +       case 3:
> +       case 2:
> +       case 1:
> +               for (i = 0; i < num; i++) {
> +                       qe[i].cmd_byte =
> +                               cmd_byte_map[qm_port->is_directed][ev[i].op];
> +                       qe[i].sched_type = sched_type[i];
> +                       qe[i].data = ev[i].u64;
> +                       qe[i].qid = queue_id[i];
> +                       qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
> +                       qe[i].lock_id = ev[i].flow_id;
> +                       if (sched_type[i] == DLB2_SCHED_DIRECTED) {
> +                               struct dlb2_msg_info *info =
> +                                       (struct dlb2_msg_info *)&qe[i].lock_id;
> +
> +                               info->qid = queue_id[i];
> +                               info->sched_type = DLB2_SCHED_DIRECTED;
> +                               info->priority = qe[i].priority;
> +                       }
> +                       qe[i].u.event_type.major = ev[i].event_type;
> +                       qe[i].u.event_type.sub = ev[i].sub_event_type;
> +               }
> +               break;
> +       case 0:
> +               break;
> +       }
> +}
> diff --git a/drivers/event/dlb2/dlb2_priv.h b/drivers/event/dlb2/dlb2_priv.h
> index 4a06d649ab..df69d57b83 100644
> --- a/drivers/event/dlb2/dlb2_priv.h
> +++ b/drivers/event/dlb2/dlb2_priv.h
> @@ -1,5 +1,5 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
> - * Copyright(c) 2016-2020 Intel Corporation
> + * Copyright(c) 2016-2022 Intel Corporation
>   */
>
>  #ifndef _DLB2_PRIV_H_
> @@ -377,6 +377,7 @@ struct dlb2_port {
>         struct dlb2_eventdev_port *ev_port; /* back ptr */
>         bool use_scalar; /* force usage of scalar code */
>         uint16_t hw_credit_quanta;
> +       bool use_avx512;
>  };
>
>  /* Per-process per-port mmio and memory pointers */
> @@ -686,6 +687,13 @@ int dlb2_parse_params(const char *params,
>                       struct dlb2_devargs *dlb2_args,
>                       uint8_t version);
>
> +void dlb2_event_build_hcws(struct dlb2_port *qm_port,
> +                          const struct rte_event ev[],
> +                          int num,
> +                          uint8_t *sched_type,
> +                          uint8_t *queue_id);
> +
> +
>  /* Extern globals */
>  extern struct process_local_port_data dlb2_port[][DLB2_NUM_PORT_TYPES];
>
> diff --git a/drivers/event/dlb2/dlb2_sse.c b/drivers/event/dlb2/dlb2_sse.c
> new file mode 100644
> index 0000000000..8fc12d47f7
> --- /dev/null
> +++ b/drivers/event/dlb2/dlb2_sse.c
> @@ -0,0 +1,219 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <stdbool.h>
> +
> +#include "dlb2_priv.h"
> +#include "dlb2_iface.h"
> +#include "dlb2_inline_fns.h"
> +
> +/*
> + * This source file is only used when the compiler on the build machine
> + * does not support AVX512VL.
> + */
> +
> +static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
> +       {
> +               /* Load-balanced cmd bytes */
> +               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
> +               [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
> +       },
> +       {
> +               /* Directed cmd bytes */
> +               [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
> +               [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
> +       },
> +};
> +
> +void
> +dlb2_event_build_hcws(struct dlb2_port *qm_port,
> +                     const struct rte_event ev[],
> +                     int num,
> +                     uint8_t *sched_type,
> +                     uint8_t *queue_id)
> +{
> +       struct dlb2_enqueue_qe *qe;
> +       uint16_t sched_word[4];
> +       __m128i sse_qe[2];
> +       int i;
> +
> +       qe = qm_port->qe4;
> +
> +       sse_qe[0] = _mm_setzero_si128();
> +       sse_qe[1] = _mm_setzero_si128();
> +
> +       switch (num) {
> +       case 4:
> +               /* Construct the metadata portion of two HCWs in one 128b SSE
> +                * register. HCW metadata is constructed in the SSE registers
> +                * like so:
> +                * sse_qe[0][63:0]:   qe[0]'s metadata
> +                * sse_qe[0][127:64]: qe[1]'s metadata
> +                * sse_qe[1][63:0]:   qe[2]'s metadata
> +                * sse_qe[1][127:64]: qe[3]'s metadata
> +                */
> +
> +               /* Convert the event operation into a command byte and store it
> +                * in the metadata:
> +                * sse_qe[0][63:56]   = cmd_byte_map[is_directed][ev[0].op]
> +                * sse_qe[0][127:120] = cmd_byte_map[is_directed][ev[1].op]
> +                * sse_qe[1][63:56]   = cmd_byte_map[is_directed][ev[2].op]
> +                * sse_qe[1][127:120] = cmd_byte_map[is_directed][ev[3].op]
> +                */
> +#define DLB2_QE_CMD_BYTE 7
> +               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> +                               cmd_byte_map[qm_port->is_directed][ev[0].op],
> +                               DLB2_QE_CMD_BYTE);
> +               sse_qe[0] = _mm_insert_epi8(sse_qe[0],
> +                               cmd_byte_map[qm_port->is_directed][ev[1].op],
> +                               DLB2_QE_CMD_BYTE + 8);
> +               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> +                               cmd_byte_map[qm_port->is_directed][ev[2].op],
> +                               DLB2_QE_CMD_BYTE);
> +               sse_qe[1] = _mm_insert_epi8(sse_qe[1],
> +                               cmd_byte_map[qm_port->is_directed][ev[3].op],
> +                               DLB2_QE_CMD_BYTE + 8);
> +
> +               /* Store priority, scheduling type, and queue ID in the sched
> +                * word array because these values are re-used when the
> +                * destination is a directed queue.
> +                */
> +               sched_word[0] = EV_TO_DLB2_PRIO(ev[0].priority) << 10 |
> +                               sched_type[0] << 8 |
> +                               queue_id[0];
> +               sched_word[1] = EV_TO_DLB2_PRIO(ev[1].priority) << 10 |
> +                               sched_type[1] << 8 |
> +                               queue_id[1];
> +               sched_word[2] = EV_TO_DLB2_PRIO(ev[2].priority) << 10 |
> +                               sched_type[2] << 8 |
> +                               queue_id[2];
> +               sched_word[3] = EV_TO_DLB2_PRIO(ev[3].priority) << 10 |
> +                               sched_type[3] << 8 |
> +                               queue_id[3];
> +
> +               /* Store the event priority, scheduling type, and queue ID in
> +                * the metadata:
> +                * sse_qe[0][31:16] = sched_word[0]
> +                * sse_qe[0][95:80] = sched_word[1]
> +                * sse_qe[1][31:16] = sched_word[2]
> +                * sse_qe[1][95:80] = sched_word[3]
> +                */
> +#define DLB2_QE_QID_SCHED_WORD 1
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            sched_word[0],
> +                                            DLB2_QE_QID_SCHED_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            sched_word[1],
> +                                            DLB2_QE_QID_SCHED_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            sched_word[2],
> +                                            DLB2_QE_QID_SCHED_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            sched_word[3],
> +                                            DLB2_QE_QID_SCHED_WORD + 4);
> +
> +               /* If the destination is a load-balanced queue, store the lock
> +                * ID. If it is a directed queue, DLB places this field in
> +                * bytes 10-11 of the received QE, so we format it accordingly:
> +                * sse_qe[0][47:32]  = dir queue ? sched_word[0] : flow_id[0]
> +                * sse_qe[0][111:96] = dir queue ? sched_word[1] : flow_id[1]
> +                * sse_qe[1][47:32]  = dir queue ? sched_word[2] : flow_id[2]
> +                * sse_qe[1][111:96] = dir queue ? sched_word[3] : flow_id[3]
> +                */
> +#define DLB2_QE_LOCK_ID_WORD 2
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                               (sched_type[0] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[0] : ev[0].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                               (sched_type[1] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[1] : ev[1].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                               (sched_type[2] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[2] : ev[2].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                               (sched_type[3] == DLB2_SCHED_DIRECTED) ?
> +                                       sched_word[3] : ev[3].flow_id,
> +                               DLB2_QE_LOCK_ID_WORD + 4);
> +
> +               /* Store the event type and sub event type in the metadata:
> +                * sse_qe[0][15:0]  = flow_id[0]
> +                * sse_qe[0][79:64] = flow_id[1]
> +                * sse_qe[1][15:0]  = flow_id[2]
> +                * sse_qe[1][79:64] = flow_id[3]
> +                */
> +#define DLB2_QE_EV_TYPE_WORD 0
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            ev[0].sub_event_type << 8 |
> +                                               ev[0].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD);
> +               sse_qe[0] = _mm_insert_epi16(sse_qe[0],
> +                                            ev[1].sub_event_type << 8 |
> +                                               ev[1].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD + 4);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            ev[2].sub_event_type << 8 |
> +                                               ev[2].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD);
> +               sse_qe[1] = _mm_insert_epi16(sse_qe[1],
> +                                            ev[3].sub_event_type << 8 |
> +                                               ev[3].event_type,
> +                                            DLB2_QE_EV_TYPE_WORD + 4);
> +
> +               /*
> +                * Store the metadata to memory (use the double-precision
> +                * _mm_storeh_pd because there is no integer function for
> +                * storing the upper 64b):
> +                * qe[0] metadata = sse_qe[0][63:0]
> +                * qe[1] metadata = sse_qe[0][127:64]
> +                * qe[2] metadata = sse_qe[1][63:0]
> +                * qe[3] metadata = sse_qe[1][127:64]
> +                */
> +               _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data,
> +                                sse_qe[0]);
> +               _mm_storeh_pd((double *)&qe[1].u.opaque_data,
> +                             (__m128d)sse_qe[0]);
> +               _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data,
> +                                sse_qe[1]);
> +               _mm_storeh_pd((double *)&qe[3].u.opaque_data,
> +                                     (__m128d)sse_qe[1]);
> +
> +               qe[0].data = ev[0].u64;
> +               qe[1].data = ev[1].u64;
> +               qe[2].data = ev[2].u64;
> +               qe[3].data = ev[3].u64;
> +
> +               break;
> +       case 3:
> +       case 2:
> +       case 1:
> +               for (i = 0; i < num; i++) {
> +                       qe[i].cmd_byte =
> +                               cmd_byte_map[qm_port->is_directed][ev[i].op];
> +                       qe[i].sched_type = sched_type[i];
> +                       qe[i].data = ev[i].u64;
> +                       qe[i].qid = queue_id[i];
> +                       qe[i].priority = EV_TO_DLB2_PRIO(ev[i].priority);
> +                       qe[i].lock_id = ev[i].flow_id;
> +                       if (sched_type[i] == DLB2_SCHED_DIRECTED) {
> +                               struct dlb2_msg_info *info =
> +                                       (struct dlb2_msg_info *)&qe[i].lock_id;
> +
> +                               info->qid = queue_id[i];
> +                               info->sched_type = DLB2_SCHED_DIRECTED;
> +                               info->priority = qe[i].priority;
> +                       }
> +                       qe[i].u.event_type.major = ev[i].event_type;
> +                       qe[i].u.event_type.sub = ev[i].sub_event_type;
> +               }
> +               break;
> +       case 0:
> +               break;
> +       }
> +}
> diff --git a/drivers/event/dlb2/meson.build b/drivers/event/dlb2/meson.build
> index f963589fd3..c08f480570 100644
> --- a/drivers/event/dlb2/meson.build
> +++ b/drivers/event/dlb2/meson.build
> @@ -19,6 +19,50 @@ sources = files(
>          'dlb2_selftest.c',
>  )
>
> +# compile AVX512 version if:
> +# we are building 64-bit binary (checked above) AND binutils
> +# can generate proper code
> +
> +if binutils_ok
> +
> +    # compile AVX512 version if either:
> +    # a. we have AVX512VL supported in minimum instruction set
> +    #    baseline
> +    # b. it's not minimum instruction set, but supported by
> +    #    compiler
> +    #
> +    # in former case, just add avx512 C file to files list
> +    # in latter case, compile c file to static lib, using correct
> +    # compiler flags, and then have the .o file from static lib
> +    # linked into main lib.
> +
> +    # check if all required flags already enabled (variant a).
> +    dlb2_avx512_on = false
> +    if cc.get_define(f, args: machine_args) == '__AVX512VL__'
> +        dlb2_avx512_on = true
> +    endif
> +
> +    if dlb2_avx512_on == true
> +
> +        sources += files('dlb2_avx512.c')
> +        cflags += '-DCC_AVX512_SUPPORT'
> +
> +    elif cc.has_multi_arguments('-mavx512vl')
> +
> +        cflags += '-DCC_AVX512_SUPPORT'
> +        avx512_tmplib = static_library('avx512_tmp',
> +                               'dlb2_avx512.c',
> +                               dependencies: [static_rte_eal,
> +                                             static_rte_eventdev],
> +                               c_args: cflags + ['-mavx512vl'])
> +        objs += avx512_tmplib.extract_objects('dlb2_avx512.c')
> +    else
> +        sources += files('dlb2_sse.c')
> +    endif
> +else
> +        sources += files('dlb2_sse.c')
> +endif
> +
>  headers = files('rte_pmd_dlb2.h')
>
>  deps += ['mbuf', 'mempool', 'ring', 'pci', 'bus_pci']
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-06-14 10:40 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-09 15:18 [PATCH] event/dlb2: add support for single 512B write of 4 QEs Timothy McDaniel
2022-05-14 12:07 ` Jerin Jacob
2022-05-16  8:42   ` Bruce Richardson
2022-05-16 17:00   ` McDaniel, Timothy
2022-05-19 20:24 ` [PATCH v3] " Timothy McDaniel
2022-05-23 16:09 ` [PATCH v4] " Timothy McDaniel
2022-05-23 16:34   ` Bruce Richardson
2022-05-23 16:52     ` McDaniel, Timothy
2022-05-23 16:55       ` Bruce Richardson
2022-06-09 17:40         ` Jerin Jacob
2022-06-09 18:02           ` McDaniel, Timothy
2022-05-23 16:37   ` Bruce Richardson
2022-05-23 16:45     ` McDaniel, Timothy
2022-06-10 12:43 ` [PATCH v6] " Timothy McDaniel
2022-06-10 15:41 ` [PATCH v7] " Timothy McDaniel
2022-06-10 16:15   ` Bruce Richardson
2022-06-10 16:27 ` [PATCH v8] " Timothy McDaniel
2022-06-13  6:30   ` Jerin Jacob
2022-06-13 20:39 ` [PATCH v9] " Timothy McDaniel
2022-06-14 10:40   ` Jerin Jacob

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).