* [PATCH 1/1] mldev: split bfloat16 routines to separate files
@ 2023-03-13 11:43 Srikanth Yalavarthi
2023-03-13 12:03 ` [PATCH v2 " Srikanth Yalavarthi
0 siblings, 1 reply; 8+ messages in thread
From: Srikanth Yalavarthi @ 2023-03-13 11:43 UTC (permalink / raw)
To: Srikanth Yalavarthi, Ruifeng Wang; +Cc: dev, sshankarnara, david.marchand
Since bfloat16 intrinsics are not supported on all ARM platforms
that support NEON, bfloat16 routines are moved to separate files.
This would enable using scalar implementation for bfloat16 on
unsupported ARM platforms.
Bugzilla ID: 1179
Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
Depends-on: patch-120653 ("mldev: remove weak symbols use in type conversions")
lib/mldev/meson.build | 11 +-
lib/mldev/mldev_utils_neon.c | 142 +-----------
lib/mldev/mldev_utils_neon_bfloat16.c | 154 +++++++++++++
lib/mldev/mldev_utils_scalar.c | 278 +-----------------------
lib/mldev/mldev_utils_scalar.h | 80 +++++++
lib/mldev/mldev_utils_scalar_bfloat16.c | 197 +++++++++++++++++
6 files changed, 453 insertions(+), 409 deletions(-)
create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c
create mode 100644 lib/mldev/mldev_utils_scalar.h
create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c
diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
index c9db42257b..5769b0640a 100644
--- a/lib/mldev/meson.build
+++ b/lib/mldev/meson.build
@@ -7,12 +7,21 @@ sources = files(
'mldev_utils.c',
)
-if dpdk_conf.has('RTE_ARCH_ARM64')
+if (dpdk_conf.has('RTE_ARCH_ARM64') and
+ cc.get_define('__ARM_NEON', args: machine_args) != '')
sources += files('mldev_utils_neon.c')
else
sources += files('mldev_utils_scalar.c')
endif
+if (dpdk_conf.has('RTE_ARCH_ARM64') and
+ cc.get_define('__ARM_NEON', args: machine_args) != '' and
+ cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
+ sources += files('mldev_utils_neon_bfloat16.c')
+else
+ sources += files('mldev_utils_scalar_bfloat16.c')
+endif
+
headers = files(
'rte_mldev.h',
)
diff --git a/lib/mldev/mldev_utils_neon.c b/lib/mldev/mldev_utils_neon.c
index 32b620db20..c7baec012b 100644
--- a/lib/mldev/mldev_utils_neon.c
+++ b/lib/mldev/mldev_utils_neon.c
@@ -12,8 +12,8 @@
/* Description:
* This file implements vector versions of Machine Learning utility functions used to convert data
- * types from higher precision to lower precision and vice-versa. Implementation is based on Arm
- * Neon intrinsics.
+ * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
+ * is based on Arm Neon intrinsics.
*/
static inline void
@@ -733,141 +733,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
return 0;
}
-
-#ifdef __ARM_FEATURE_BF16
-
-static inline void
-__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
-{
- float32x4_t f32x4;
- bfloat16x4_t bf16x4;
-
- /* load 4 x float32_t elements */
- f32x4 = vld1q_f32(input);
-
- /* convert float32x4_t to bfloat16x4_t */
- bf16x4 = vcvt_bf16_f32(f32x4);
-
- /* store bfloat16x4_t */
- vst1_bf16(output, bf16x4);
-}
-
-static inline void
-__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
-{
- float32x4_t f32x4;
- bfloat16x4_t bf16x4;
-
- /* load element to 4 lanes */
- f32x4 = vld1q_dup_f32(input);
-
- /* convert float32_t to bfloat16_t */
- bf16x4 = vcvt_bf16_f32(f32x4);
-
- /* store lane 0 / 1 element */
- vst1_lane_bf16(output, bf16x4, 0);
-}
-
-int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
-{
- float32_t *input_buffer;
- bfloat16_t *output_buffer;
- uint64_t nb_iterations;
- uint32_t vlen;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (float32_t *)input;
- output_buffer = (bfloat16_t *)output;
- vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
- nb_iterations = nb_elements / vlen;
-
- /* convert vlen elements in each iteration */
- for (i = 0; i < nb_iterations; i++) {
- __float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
- input_buffer += vlen;
- output_buffer += vlen;
- }
-
- /* convert leftover elements */
- i = i * vlen;
- for (; i < nb_elements; i++) {
- __float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
- input_buffer++;
- output_buffer++;
- }
-
- return 0;
-}
-
-static inline void
-__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
-{
- bfloat16x4_t bf16x4;
- float32x4_t f32x4;
-
- /* load 4 x bfloat16_t elements */
- bf16x4 = vld1_bf16(input);
-
- /* convert bfloat16x4_t to float32x4_t */
- f32x4 = vcvt_f32_bf16(bf16x4);
-
- /* store float32x4_t */
- vst1q_f32(output, f32x4);
-}
-
-static inline void
-__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
-{
- bfloat16x4_t bf16x4;
- float32x4_t f32x4;
-
- /* load element to 4 lanes */
- bf16x4 = vld1_dup_bf16(input);
-
- /* convert bfloat16_t to float32_t */
- f32x4 = vcvt_f32_bf16(bf16x4);
-
- /* store lane 0 / 1 element */
- vst1q_lane_f32(output, f32x4, 0);
-}
-
-int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
-{
- bfloat16_t *input_buffer;
- float32_t *output_buffer;
- uint64_t nb_iterations;
- uint32_t vlen;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (bfloat16_t *)input;
- output_buffer = (float32_t *)output;
- vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
- nb_iterations = nb_elements / vlen;
-
- /* convert vlen elements in each iteration */
- for (i = 0; i < nb_iterations; i++) {
- __bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
- input_buffer += vlen;
- output_buffer += vlen;
- }
-
- /* convert leftover elements */
- i = i * vlen;
- for (; i < nb_elements; i++) {
- __bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
- input_buffer++;
- output_buffer++;
- }
-
- return 0;
-}
-
-#endif /* __ARM_FEATURE_BF16 */
diff --git a/lib/mldev/mldev_utils_neon_bfloat16.c b/lib/mldev/mldev_utils_neon_bfloat16.c
new file mode 100644
index 0000000000..8dec3fd834
--- /dev/null
+++ b/lib/mldev/mldev_utils_neon_bfloat16.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "mldev_utils.h"
+
+#include <arm_neon.h>
+
+/* Description:
+ * This file implements vector versions of Machine Learning utility functions used to convert data
+ * types from bfloat16 to float and vice-versa. Implementation is based on Arm Neon intrinsics.
+ */
+
+#ifdef __ARM_FEATURE_BF16
+
+static inline void
+__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load 4 x float32_t elements */
+ f32x4 = vld1q_f32(input);
+
+ /* convert float32x4_t to bfloat16x4_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store bfloat16x4_t */
+ vst1_bf16(output, bf16x4);
+}
+
+static inline void
+__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load element to 4 lanes */
+ f32x4 = vld1q_dup_f32(input);
+
+ /* convert float32_t to bfloat16_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store lane 0 / 1 element */
+ vst1_lane_bf16(output, bf16x4, 0);
+}
+
+int
+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+{
+ float32_t *input_buffer;
+ bfloat16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float32_t *)input;
+ output_buffer = (bfloat16_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load 4 x bfloat16_t elements */
+ bf16x4 = vld1_bf16(input);
+
+ /* convert bfloat16x4_t to float32x4_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load element to 4 lanes */
+ bf16x4 = vld1_dup_bf16(input);
+
+ /* convert bfloat16_t to float32_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store lane 0 / 1 element */
+ vst1q_lane_f32(output, f32x4, 0);
+}
+
+int
+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ bfloat16_t *input_buffer;
+ float32_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (bfloat16_t *)input;
+ output_buffer = (float32_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+#endif /* __ARM_FEATURE_BF16 */
diff --git a/lib/mldev/mldev_utils_scalar.c b/lib/mldev/mldev_utils_scalar.c
index 322b009f5d..a345b1e73c 100644
--- a/lib/mldev/mldev_utils_scalar.c
+++ b/lib/mldev/mldev_utils_scalar.c
@@ -2,88 +2,13 @@
* Copyright (c) 2022 Marvell.
*/
-#include <errno.h>
-#include <math.h>
-#include <stdint.h>
-
-#include "mldev_utils.h"
+#include "mldev_utils_scalar.h"
/* Description:
* This file implements scalar versions of Machine Learning utility functions used to convert data
- * types from higher precision to lower precision and vice-versa.
+ * types from higher precision to lower precision and vice-versa, except bfloat16.
*/
-#ifndef BIT
-#define BIT(nr) (1UL << (nr))
-#endif
-
-#ifndef BITS_PER_LONG
-#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
-#endif
-
-#ifndef GENMASK_U32
-#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
-#endif
-
-/* float32: bit index of MSB & LSB of sign, exponent and mantissa */
-#define FP32_LSB_M 0
-#define FP32_MSB_M 22
-#define FP32_LSB_E 23
-#define FP32_MSB_E 30
-#define FP32_LSB_S 31
-#define FP32_MSB_S 31
-
-/* float32: bitmask for sign, exponent and mantissa */
-#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)
-#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)
-#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)
-
-/* float16: bit index of MSB & LSB of sign, exponent and mantissa */
-#define FP16_LSB_M 0
-#define FP16_MSB_M 9
-#define FP16_LSB_E 10
-#define FP16_MSB_E 14
-#define FP16_LSB_S 15
-#define FP16_MSB_S 15
-
-/* float16: bitmask for sign, exponent and mantissa */
-#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)
-#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)
-#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)
-
-/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */
-#define BF16_LSB_M 0
-#define BF16_MSB_M 6
-#define BF16_LSB_E 7
-#define BF16_MSB_E 14
-#define BF16_LSB_S 15
-#define BF16_MSB_S 15
-
-/* bfloat16: bitmask for sign, exponent and mantissa */
-#define BF16_MASK_S GENMASK_U32(BF16_MSB_S, BF16_LSB_S)
-#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)
-#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)
-
-/* Exponent bias */
-#define FP32_BIAS_E 127
-#define FP16_BIAS_E 15
-#define BF16_BIAS_E 127
-
-#define FP32_PACK(sign, exponent, mantissa) \
- (((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))
-
-#define FP16_PACK(sign, exponent, mantissa) \
- (((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))
-
-#define BF16_PACK(sign, exponent, mantissa) \
- (((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))
-
-/* Represent float32 as float and uint32_t */
-union float32 {
- float f;
- uint32_t u;
-};
-
int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
@@ -334,18 +259,18 @@ __float32_to_float16_scalar_rtn(float x)
f16_m = 0;
switch (f32_e) {
- case (0): /* float32: zero or subnormal number */
+ case (0): /* float32: zero or subnormal number */
f16_e = 0;
if (f32_m == 0) /* zero */
f16_m = 0;
- else /* subnormal number, convert to zero */
+ else /* subnormal number, convert to zero */
f16_m = 0;
break;
case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
f16_e = FP16_MASK_E >> FP16_LSB_E;
- if (f32_m == 0) { /* infinity */
+ if (f32_m == 0) { /* infinity */
f16_m = 0;
- } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
+ } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
f16_m = f32_m >> (FP32_MSB_M - FP16_MSB_M);
f16_m |= BIT(FP16_MSB_M);
}
@@ -477,20 +402,20 @@ __float16_to_float32_scalar_rtx(uint16_t f16)
switch (f16_e) {
case (FP16_MASK_E >> FP16_LSB_E): /* float16: infinity or nan */
f32_e = FP32_MASK_E >> FP32_LSB_E;
- if (f16_m == 0x0) { /* infinity */
+ if (f16_m == 0x0) { /* infinity */
f32_m = f16_m;
- } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
+ } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
f32_m = f16_m;
shift = FP32_MSB_M - FP16_MSB_M;
f32_m = (f32_m << shift) & FP32_MASK_M;
f32_m |= BIT(FP32_MSB_M);
}
break;
- case 0: /* float16: zero or sub-normal */
+ case 0: /* float16: zero or sub-normal */
f32_m = f16_m;
if (f16_m == 0) { /* zero signed */
f32_e = 0;
- } else { /* subnormal numbers */
+ } else { /* subnormal numbers */
clz = __builtin_clz((uint32_t)f16_m) - sizeof(uint32_t) * 8 + FP16_LSB_E;
e_16 = (int)f16_e - clz;
f32_e = FP32_BIAS_E + e_16 - FP16_BIAS_E;
@@ -535,186 +460,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
return 0;
}
-
-/* Convert a single precision floating point number (float32) into a
- * brain float number (bfloat16) using round to nearest rounding mode.
- */
-static uint16_t
-__float32_to_bfloat16_scalar_rtn(float x)
-{
- union float32 f32; /* float32 input */
- uint32_t f32_s; /* float32 sign */
- uint32_t f32_e; /* float32 exponent */
- uint32_t f32_m; /* float32 mantissa */
- uint16_t b16_s; /* float16 sign */
- uint16_t b16_e; /* float16 exponent */
- uint16_t b16_m; /* float16 mantissa */
- uint32_t tbits; /* number of truncated bits */
- uint16_t u16; /* float16 output */
-
- f32.f = x;
- f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
- f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
- f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;
-
- b16_s = f32_s;
- b16_e = 0;
- b16_m = 0;
-
- switch (f32_e) {
- case (0): /* float32: zero or subnormal number */
- b16_e = 0;
- if (f32_m == 0) /* zero */
- b16_m = 0;
- else /* subnormal float32 number, normal bfloat16 */
- goto bf16_normal;
- break;
- case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
- b16_e = BF16_MASK_E >> BF16_LSB_E;
- if (f32_m == 0) { /* infinity */
- b16_m = 0;
- } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
- b16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);
- b16_m |= BIT(BF16_MSB_M);
- }
- break;
- default: /* float32: normal number, normal bfloat16 */
- goto bf16_normal;
- }
-
- goto bf16_pack;
-
-bf16_normal:
- b16_e = f32_e;
- tbits = FP32_MSB_M - BF16_MSB_M;
- b16_m = f32_m >> tbits;
-
- /* if non-leading truncated bits are set */
- if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
- b16_m++;
-
- /* if overflow into exponent */
- if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
- b16_e++;
- } else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
- /* if only leading truncated bit is set */
- if ((b16_m & 0x1) == 0x1) {
- b16_m++;
-
- /* if overflow into exponent */
- if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
- b16_e++;
- }
- }
- b16_m = b16_m & BF16_MASK_M;
-
-bf16_pack:
- u16 = BF16_PACK(b16_s, b16_e, b16_m);
-
- return u16;
-}
-
-int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
-{
- float *input_buffer;
- uint16_t *output_buffer;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (float *)input;
- output_buffer = (uint16_t *)output;
-
- for (i = 0; i < nb_elements; i++) {
- *output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);
-
- input_buffer = input_buffer + 1;
- output_buffer = output_buffer + 1;
- }
-
- return 0;
-}
-
-/* Convert a brain float number (bfloat16) into a
- * single precision floating point number (float32).
- */
-static float
-__bfloat16_to_float32_scalar_rtx(uint16_t f16)
-{
- union float32 f32; /* float32 output */
- uint16_t b16_s; /* float16 sign */
- uint16_t b16_e; /* float16 exponent */
- uint16_t b16_m; /* float16 mantissa */
- uint32_t f32_s; /* float32 sign */
- uint32_t f32_e; /* float32 exponent */
- uint32_t f32_m; /* float32 mantissa*/
- uint8_t shift; /* number of bits to be shifted */
-
- b16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;
- b16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;
- b16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;
-
- f32_s = b16_s;
- switch (b16_e) {
- case (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */
- f32_e = FP32_MASK_E >> FP32_LSB_E;
- if (b16_m == 0x0) { /* infinity */
- f32_m = 0;
- } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
- f32_m = b16_m;
- shift = FP32_MSB_M - BF16_MSB_M;
- f32_m = (f32_m << shift) & FP32_MASK_M;
- f32_m |= BIT(FP32_MSB_M);
- }
- break;
- case 0: /* bfloat16: zero or subnormal */
- f32_m = b16_m;
- if (b16_m == 0) { /* zero signed */
- f32_e = 0;
- } else { /* subnormal numbers */
- goto fp32_normal;
- }
- break;
- default: /* bfloat16: normal number */
- goto fp32_normal;
- }
-
- goto fp32_pack;
-
-fp32_normal:
- f32_m = b16_m;
- f32_e = FP32_BIAS_E + b16_e - BF16_BIAS_E;
-
- shift = (FP32_MSB_M - BF16_MSB_M);
- f32_m = (f32_m << shift) & FP32_MASK_M;
-
-fp32_pack:
- f32.u = FP32_PACK(f32_s, f32_e, f32_m);
-
- return f32.f;
-}
-
-int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
-{
- uint16_t *input_buffer;
- float *output_buffer;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (uint16_t *)input;
- output_buffer = (float *)output;
-
- for (i = 0; i < nb_elements; i++) {
- *output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);
-
- input_buffer = input_buffer + 1;
- output_buffer = output_buffer + 1;
- }
-
- return 0;
-}
diff --git a/lib/mldev/mldev_utils_scalar.h b/lib/mldev/mldev_utils_scalar.h
new file mode 100644
index 0000000000..57e66ddb60
--- /dev/null
+++ b/lib/mldev/mldev_utils_scalar.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "mldev_utils.h"
+
+#ifndef BIT
+#define BIT(nr) (1UL << (nr))
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
+#endif
+
+#ifndef GENMASK_U32
+#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+#endif
+
+/* float32: bit index of MSB & LSB of sign, exponent and mantissa */
+#define FP32_LSB_M 0
+#define FP32_MSB_M 22
+#define FP32_LSB_E 23
+#define FP32_MSB_E 30
+#define FP32_LSB_S 31
+#define FP32_MSB_S 31
+
+/* float32: bitmask for sign, exponent and mantissa */
+#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)
+#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)
+#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)
+
+/* float16: bit index of MSB & LSB of sign, exponent and mantissa */
+#define FP16_LSB_M 0
+#define FP16_MSB_M 9
+#define FP16_LSB_E 10
+#define FP16_MSB_E 14
+#define FP16_LSB_S 15
+#define FP16_MSB_S 15
+
+/* float16: bitmask for sign, exponent and mantissa */
+#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)
+#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)
+#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)
+
+/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */
+#define BF16_LSB_M 0
+#define BF16_MSB_M 6
+#define BF16_LSB_E 7
+#define BF16_MSB_E 14
+#define BF16_LSB_S 15
+#define BF16_MSB_S 15
+
+/* bfloat16: bitmask for sign, exponent and mantissa */
+#define BF16_MASK_S GENMASK_U32(BF16_MSB_S, BF16_LSB_S)
+#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)
+#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)
+
+/* Exponent bias */
+#define FP32_BIAS_E 127
+#define FP16_BIAS_E 15
+#define BF16_BIAS_E 127
+
+#define FP32_PACK(sign, exponent, mantissa) \
+ (((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))
+
+#define FP16_PACK(sign, exponent, mantissa) \
+ (((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))
+
+#define BF16_PACK(sign, exponent, mantissa) \
+ (((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))
+
+/* Represent float32 as float and uint32_t */
+union float32 {
+ float f;
+ uint32_t u;
+};
diff --git a/lib/mldev/mldev_utils_scalar_bfloat16.c b/lib/mldev/mldev_utils_scalar_bfloat16.c
new file mode 100644
index 0000000000..43f9431835
--- /dev/null
+++ b/lib/mldev/mldev_utils_scalar_bfloat16.c
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "mldev_utils_scalar.h"
+
+/* Description:
+ * This file implements scalar versions of Machine Learning utility functions used to convert data
+ * types from bfloat16 to float32 and vice-versa.
+ */
+
+/* Convert a single precision floating point number (float32) into a
+ * brain float number (bfloat16) using round to nearest rounding mode.
+ */
+static uint16_t
+__float32_to_bfloat16_scalar_rtn(float x)
+{
+ union float32 f32; /* float32 input */
+ uint32_t f32_s; /* float32 sign */
+ uint32_t f32_e; /* float32 exponent */
+ uint32_t f32_m; /* float32 mantissa */
+ uint16_t b16_s; /* float16 sign */
+ uint16_t b16_e; /* float16 exponent */
+ uint16_t b16_m; /* float16 mantissa */
+ uint32_t tbits; /* number of truncated bits */
+ uint16_t u16; /* float16 output */
+
+ f32.f = x;
+ f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
+ f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
+ f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;
+
+ b16_s = f32_s;
+ b16_e = 0;
+ b16_m = 0;
+
+ switch (f32_e) {
+ case (0): /* float32: zero or subnormal number */
+ b16_e = 0;
+ if (f32_m == 0) /* zero */
+ b16_m = 0;
+ else /* subnormal float32 number, normal bfloat16 */
+ goto bf16_normal;
+ break;
+ case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
+ b16_e = BF16_MASK_E >> BF16_LSB_E;
+ if (f32_m == 0) { /* infinity */
+ b16_m = 0;
+ } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
+ b16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);
+ b16_m |= BIT(BF16_MSB_M);
+ }
+ break;
+ default: /* float32: normal number, normal bfloat16 */
+ goto bf16_normal;
+ }
+
+ goto bf16_pack;
+
+bf16_normal:
+ b16_e = f32_e;
+ tbits = FP32_MSB_M - BF16_MSB_M;
+ b16_m = f32_m >> tbits;
+
+ /* if non-leading truncated bits are set */
+ if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
+ b16_m++;
+
+ /* if overflow into exponent */
+ if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
+ b16_e++;
+ } else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
+ /* if only leading truncated bit is set */
+ if ((b16_m & 0x1) == 0x1) {
+ b16_m++;
+
+ /* if overflow into exponent */
+ if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
+ b16_e++;
+ }
+ }
+ b16_m = b16_m & BF16_MASK_M;
+
+bf16_pack:
+ u16 = BF16_PACK(b16_s, b16_e, b16_m);
+
+ return u16;
+}
+
+int
+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ uint16_t *output_buffer;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (uint16_t *)output;
+
+ for (i = 0; i < nb_elements; i++) {
+ *output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);
+
+ input_buffer = input_buffer + 1;
+ output_buffer = output_buffer + 1;
+ }
+
+ return 0;
+}
+
+/* Convert a brain float number (bfloat16) into a
+ * single precision floating point number (float32).
+ */
+static float
+__bfloat16_to_float32_scalar_rtx(uint16_t f16)
+{
+ union float32 f32; /* float32 output */
+ uint16_t b16_s; /* float16 sign */
+ uint16_t b16_e; /* float16 exponent */
+ uint16_t b16_m; /* float16 mantissa */
+ uint32_t f32_s; /* float32 sign */
+ uint32_t f32_e; /* float32 exponent */
+ uint32_t f32_m; /* float32 mantissa*/
+ uint8_t shift; /* number of bits to be shifted */
+
+ b16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;
+ b16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;
+ b16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;
+
+ f32_s = b16_s;
+ switch (b16_e) {
+ case (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */
+ f32_e = FP32_MASK_E >> FP32_LSB_E;
+ if (b16_m == 0x0) { /* infinity */
+ f32_m = 0;
+ } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
+ f32_m = b16_m;
+ shift = FP32_MSB_M - BF16_MSB_M;
+ f32_m = (f32_m << shift) & FP32_MASK_M;
+ f32_m |= BIT(FP32_MSB_M);
+ }
+ break;
+ case 0: /* bfloat16: zero or subnormal */
+ f32_m = b16_m;
+ if (b16_m == 0) { /* zero signed */
+ f32_e = 0;
+ } else { /* subnormal numbers */
+ goto fp32_normal;
+ }
+ break;
+ default: /* bfloat16: normal number */
+ goto fp32_normal;
+ }
+
+ goto fp32_pack;
+
+fp32_normal:
+ f32_m = b16_m;
+ f32_e = FP32_BIAS_E + b16_e - BF16_BIAS_E;
+
+ shift = (FP32_MSB_M - BF16_MSB_M);
+ f32_m = (f32_m << shift) & FP32_MASK_M;
+
+fp32_pack:
+ f32.u = FP32_PACK(f32_s, f32_e, f32_m);
+
+ return f32.f;
+}
+
+int
+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ uint16_t *input_buffer;
+ float *output_buffer;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (uint16_t *)input;
+ output_buffer = (float *)output;
+
+ for (i = 0; i < nb_elements; i++) {
+ *output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);
+
+ input_buffer = input_buffer + 1;
+ output_buffer = output_buffer + 1;
+ }
+
+ return 0;
+}
--
2.17.1
* [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-13 11:43 [PATCH 1/1] mldev: split bfloat16 routines to separate files Srikanth Yalavarthi
@ 2023-03-13 12:03 ` Srikanth Yalavarthi
2023-03-15 8:23 ` David Marchand
2023-03-15 10:02 ` Ruifeng Wang
0 siblings, 2 replies; 8+ messages in thread
From: Srikanth Yalavarthi @ 2023-03-13 12:03 UTC (permalink / raw)
To: Srikanth Yalavarthi, Ruifeng Wang; +Cc: dev, sshankarnara, david.marchand
Since bfloat16 intrinsics are not supported on all ARM platforms
that support NEON, bfloat16 routines are moved to separate files.
This would enable using scalar implementation for bfloat16 on
unsupported ARM platforms.
Bugzilla ID: 1179
Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
Depends-on: patch-120653 ("mldev: remove weak symbols use in type conversions")
Depends-on: patch-125035 ("mldev: fix identical code in conditional branches")
lib/mldev/meson.build | 11 +-
lib/mldev/mldev_utils_neon.c | 142 +------------
lib/mldev/mldev_utils_neon_bfloat16.c | 154 ++++++++++++++
lib/mldev/mldev_utils_scalar.c | 262 +-----------------------
lib/mldev/mldev_utils_scalar.h | 80 ++++++++
lib/mldev/mldev_utils_scalar_bfloat16.c | 197 ++++++++++++++++++
6 files changed, 445 insertions(+), 401 deletions(-)
create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c
create mode 100644 lib/mldev/mldev_utils_scalar.h
create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c
diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
index c9db42257b..5769b0640a 100644
--- a/lib/mldev/meson.build
+++ b/lib/mldev/meson.build
@@ -7,12 +7,21 @@ sources = files(
'mldev_utils.c',
)
-if dpdk_conf.has('RTE_ARCH_ARM64')
+if (dpdk_conf.has('RTE_ARCH_ARM64') and
+ cc.get_define('__ARM_NEON', args: machine_args) != '')
sources += files('mldev_utils_neon.c')
else
sources += files('mldev_utils_scalar.c')
endif
+if (dpdk_conf.has('RTE_ARCH_ARM64') and
+ cc.get_define('__ARM_NEON', args: machine_args) != '' and
+ cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
+ sources += files('mldev_utils_neon_bfloat16.c')
+else
+ sources += files('mldev_utils_scalar_bfloat16.c')
+endif
+
headers = files(
'rte_mldev.h',
)
diff --git a/lib/mldev/mldev_utils_neon.c b/lib/mldev/mldev_utils_neon.c
index 32b620db20..c7baec012b 100644
--- a/lib/mldev/mldev_utils_neon.c
+++ b/lib/mldev/mldev_utils_neon.c
@@ -12,8 +12,8 @@
/* Description:
* This file implements vector versions of Machine Learning utility functions used to convert data
- * types from higher precision to lower precision and vice-versa. Implementation is based on Arm
- * Neon intrinsics.
+ * types from higher precision to lower precision and vice-versa, except bfloat16. Implementation
+ * is based on Arm Neon intrinsics.
*/
static inline void
@@ -733,141 +733,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
return 0;
}
-
-#ifdef __ARM_FEATURE_BF16
-
-static inline void
-__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
-{
- float32x4_t f32x4;
- bfloat16x4_t bf16x4;
-
- /* load 4 x float32_t elements */
- f32x4 = vld1q_f32(input);
-
- /* convert float32x4_t to bfloat16x4_t */
- bf16x4 = vcvt_bf16_f32(f32x4);
-
- /* store bfloat16x4_t */
- vst1_bf16(output, bf16x4);
-}
-
-static inline void
-__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
-{
- float32x4_t f32x4;
- bfloat16x4_t bf16x4;
-
- /* load element to 4 lanes */
- f32x4 = vld1q_dup_f32(input);
-
- /* convert float32_t to bfloat16_t */
- bf16x4 = vcvt_bf16_f32(f32x4);
-
- /* store lane 0 / 1 element */
- vst1_lane_bf16(output, bf16x4, 0);
-}
-
-int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
-{
- float32_t *input_buffer;
- bfloat16_t *output_buffer;
- uint64_t nb_iterations;
- uint32_t vlen;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (float32_t *)input;
- output_buffer = (bfloat16_t *)output;
- vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
- nb_iterations = nb_elements / vlen;
-
- /* convert vlen elements in each iteration */
- for (i = 0; i < nb_iterations; i++) {
- __float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
- input_buffer += vlen;
- output_buffer += vlen;
- }
-
- /* convert leftover elements */
- i = i * vlen;
- for (; i < nb_elements; i++) {
- __float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
- input_buffer++;
- output_buffer++;
- }
-
- return 0;
-}
-
-static inline void
-__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
-{
- bfloat16x4_t bf16x4;
- float32x4_t f32x4;
-
- /* load 4 x bfloat16_t elements */
- bf16x4 = vld1_bf16(input);
-
- /* convert bfloat16x4_t to float32x4_t */
- f32x4 = vcvt_f32_bf16(bf16x4);
-
- /* store float32x4_t */
- vst1q_f32(output, f32x4);
-}
-
-static inline void
-__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
-{
- bfloat16x4_t bf16x4;
- float32x4_t f32x4;
-
- /* load element to 4 lanes */
- bf16x4 = vld1_dup_bf16(input);
-
- /* convert bfloat16_t to float32_t */
- f32x4 = vcvt_f32_bf16(bf16x4);
-
- /* store lane 0 / 1 element */
- vst1q_lane_f32(output, f32x4, 0);
-}
-
-int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
-{
- bfloat16_t *input_buffer;
- float32_t *output_buffer;
- uint64_t nb_iterations;
- uint32_t vlen;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (bfloat16_t *)input;
- output_buffer = (float32_t *)output;
- vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
- nb_iterations = nb_elements / vlen;
-
- /* convert vlen elements in each iteration */
- for (i = 0; i < nb_iterations; i++) {
- __bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
- input_buffer += vlen;
- output_buffer += vlen;
- }
-
- /* convert leftover elements */
- i = i * vlen;
- for (; i < nb_elements; i++) {
- __bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
- input_buffer++;
- output_buffer++;
- }
-
- return 0;
-}
-
-#endif /* __ARM_FEATURE_BF16 */
diff --git a/lib/mldev/mldev_utils_neon_bfloat16.c b/lib/mldev/mldev_utils_neon_bfloat16.c
new file mode 100644
index 0000000000..8dec3fd834
--- /dev/null
+++ b/lib/mldev/mldev_utils_neon_bfloat16.c
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "mldev_utils.h"
+
+#include <arm_neon.h>
+
+/* Description:
+ * This file implements vector versions of Machine Learning utility functions used to convert data
+ * types from bfloat16 to float and vice-versa. Implementation is based on Arm Neon intrinsics.
+ */
+
+#ifdef __ARM_FEATURE_BF16
+
+static inline void
+__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load 4 x float32_t elements */
+ f32x4 = vld1q_f32(input);
+
+ /* convert float32x4_t to bfloat16x4_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store bfloat16x4_t */
+ vst1_bf16(output, bf16x4);
+}
+
+static inline void
+__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load element to 4 lanes */
+ f32x4 = vld1q_dup_f32(input);
+
+ /* convert float32_t to bfloat16_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store lane 0 / 1 element */
+ vst1_lane_bf16(output, bf16x4, 0);
+}
+
+int
+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+{
+ float32_t *input_buffer;
+ bfloat16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float32_t *)input;
+ output_buffer = (bfloat16_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load 4 x bfloat16_t elements */
+ bf16x4 = vld1_bf16(input);
+
+ /* convert bfloat16x4_t to float32x4_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load element to 4 lanes */
+ bf16x4 = vld1_dup_bf16(input);
+
+ /* convert bfloat16_t to float32_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store lane 0 / 1 element */
+ vst1q_lane_f32(output, f32x4, 0);
+}
+
+int
+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ bfloat16_t *input_buffer;
+ float32_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (bfloat16_t *)input;
+ output_buffer = (float32_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+#endif /* __ARM_FEATURE_BF16 */
diff --git a/lib/mldev/mldev_utils_scalar.c b/lib/mldev/mldev_utils_scalar.c
index b76a8fb326..92be5daee8 100644
--- a/lib/mldev/mldev_utils_scalar.c
+++ b/lib/mldev/mldev_utils_scalar.c
@@ -2,88 +2,13 @@
* Copyright (c) 2022 Marvell.
*/
-#include <errno.h>
-#include <math.h>
-#include <stdint.h>
-
-#include "mldev_utils.h"
+#include "mldev_utils_scalar.h"
/* Description:
* This file implements scalar versions of Machine Learning utility functions used to convert data
- * types from higher precision to lower precision and vice-versa.
+ * types from higher precision to lower precision and vice-versa, except bfloat16.
*/
-#ifndef BIT
-#define BIT(nr) (1UL << (nr))
-#endif
-
-#ifndef BITS_PER_LONG
-#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
-#endif
-
-#ifndef GENMASK_U32
-#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
-#endif
-
-/* float32: bit index of MSB & LSB of sign, exponent and mantissa */
-#define FP32_LSB_M 0
-#define FP32_MSB_M 22
-#define FP32_LSB_E 23
-#define FP32_MSB_E 30
-#define FP32_LSB_S 31
-#define FP32_MSB_S 31
-
-/* float32: bitmask for sign, exponent and mantissa */
-#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)
-#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)
-#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)
-
-/* float16: bit index of MSB & LSB of sign, exponent and mantissa */
-#define FP16_LSB_M 0
-#define FP16_MSB_M 9
-#define FP16_LSB_E 10
-#define FP16_MSB_E 14
-#define FP16_LSB_S 15
-#define FP16_MSB_S 15
-
-/* float16: bitmask for sign, exponent and mantissa */
-#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)
-#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)
-#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)
-
-/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */
-#define BF16_LSB_M 0
-#define BF16_MSB_M 6
-#define BF16_LSB_E 7
-#define BF16_MSB_E 14
-#define BF16_LSB_S 15
-#define BF16_MSB_S 15
-
-/* bfloat16: bitmask for sign, exponent and mantissa */
-#define BF16_MASK_S GENMASK_U32(BF16_MSB_S, BF16_LSB_S)
-#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)
-#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)
-
-/* Exponent bias */
-#define FP32_BIAS_E 127
-#define FP16_BIAS_E 15
-#define BF16_BIAS_E 127
-
-#define FP32_PACK(sign, exponent, mantissa) \
- (((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))
-
-#define FP16_PACK(sign, exponent, mantissa) \
- (((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))
-
-#define BF16_PACK(sign, exponent, mantissa) \
- (((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))
-
-/* Represent float32 as float and uint32_t */
-union float32 {
- float f;
- uint32_t u;
-};
-
int
rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
{
@@ -532,186 +457,3 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
return 0;
}
-
-/* Convert a single precision floating point number (float32) into a
- * brain float number (bfloat16) using round to nearest rounding mode.
- */
-static uint16_t
-__float32_to_bfloat16_scalar_rtn(float x)
-{
- union float32 f32; /* float32 input */
- uint32_t f32_s; /* float32 sign */
- uint32_t f32_e; /* float32 exponent */
- uint32_t f32_m; /* float32 mantissa */
- uint16_t b16_s; /* float16 sign */
- uint16_t b16_e; /* float16 exponent */
- uint16_t b16_m; /* float16 mantissa */
- uint32_t tbits; /* number of truncated bits */
- uint16_t u16; /* float16 output */
-
- f32.f = x;
- f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
- f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
- f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;
-
- b16_s = f32_s;
- b16_e = 0;
- b16_m = 0;
-
- switch (f32_e) {
- case (0): /* float32: zero or subnormal number */
- b16_e = 0;
- if (f32_m == 0) /* zero */
- b16_m = 0;
- else /* subnormal float32 number, normal bfloat16 */
- goto bf16_normal;
- break;
- case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
- b16_e = BF16_MASK_E >> BF16_LSB_E;
- if (f32_m == 0) { /* infinity */
- b16_m = 0;
- } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
- b16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);
- b16_m |= BIT(BF16_MSB_M);
- }
- break;
- default: /* float32: normal number, normal bfloat16 */
- goto bf16_normal;
- }
-
- goto bf16_pack;
-
-bf16_normal:
- b16_e = f32_e;
- tbits = FP32_MSB_M - BF16_MSB_M;
- b16_m = f32_m >> tbits;
-
- /* if non-leading truncated bits are set */
- if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
- b16_m++;
-
- /* if overflow into exponent */
- if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
- b16_e++;
- } else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
- /* if only leading truncated bit is set */
- if ((b16_m & 0x1) == 0x1) {
- b16_m++;
-
- /* if overflow into exponent */
- if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
- b16_e++;
- }
- }
- b16_m = b16_m & BF16_MASK_M;
-
-bf16_pack:
- u16 = BF16_PACK(b16_s, b16_e, b16_m);
-
- return u16;
-}
-
-int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
-{
- float *input_buffer;
- uint16_t *output_buffer;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (float *)input;
- output_buffer = (uint16_t *)output;
-
- for (i = 0; i < nb_elements; i++) {
- *output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);
-
- input_buffer = input_buffer + 1;
- output_buffer = output_buffer + 1;
- }
-
- return 0;
-}
-
-/* Convert a brain float number (bfloat16) into a
- * single precision floating point number (float32).
- */
-static float
-__bfloat16_to_float32_scalar_rtx(uint16_t f16)
-{
- union float32 f32; /* float32 output */
- uint16_t b16_s; /* float16 sign */
- uint16_t b16_e; /* float16 exponent */
- uint16_t b16_m; /* float16 mantissa */
- uint32_t f32_s; /* float32 sign */
- uint32_t f32_e; /* float32 exponent */
- uint32_t f32_m; /* float32 mantissa*/
- uint8_t shift; /* number of bits to be shifted */
-
- b16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;
- b16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;
- b16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;
-
- f32_s = b16_s;
- switch (b16_e) {
- case (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */
- f32_e = FP32_MASK_E >> FP32_LSB_E;
- if (b16_m == 0x0) { /* infinity */
- f32_m = 0;
- } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
- f32_m = b16_m;
- shift = FP32_MSB_M - BF16_MSB_M;
- f32_m = (f32_m << shift) & FP32_MASK_M;
- f32_m |= BIT(FP32_MSB_M);
- }
- break;
- case 0: /* bfloat16: zero or subnormal */
- f32_m = b16_m;
- if (b16_m == 0) { /* zero signed */
- f32_e = 0;
- } else { /* subnormal numbers */
- goto fp32_normal;
- }
- break;
- default: /* bfloat16: normal number */
- goto fp32_normal;
- }
-
- goto fp32_pack;
-
-fp32_normal:
- f32_m = b16_m;
- f32_e = FP32_BIAS_E + b16_e - BF16_BIAS_E;
-
- shift = (FP32_MSB_M - BF16_MSB_M);
- f32_m = (f32_m << shift) & FP32_MASK_M;
-
-fp32_pack:
- f32.u = FP32_PACK(f32_s, f32_e, f32_m);
-
- return f32.f;
-}
-
-int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
-{
- uint16_t *input_buffer;
- float *output_buffer;
- uint64_t i;
-
- if ((nb_elements == 0) || (input == NULL) || (output == NULL))
- return -EINVAL;
-
- input_buffer = (uint16_t *)input;
- output_buffer = (float *)output;
-
- for (i = 0; i < nb_elements; i++) {
- *output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);
-
- input_buffer = input_buffer + 1;
- output_buffer = output_buffer + 1;
- }
-
- return 0;
-}
diff --git a/lib/mldev/mldev_utils_scalar.h b/lib/mldev/mldev_utils_scalar.h
new file mode 100644
index 0000000000..57e66ddb60
--- /dev/null
+++ b/lib/mldev/mldev_utils_scalar.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "mldev_utils.h"
+
+#ifndef BIT
+#define BIT(nr) (1UL << (nr))
+#endif
+
+#ifndef BITS_PER_LONG
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
+#endif
+
+#ifndef GENMASK_U32
+#define GENMASK_U32(h, l) (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+#endif
+
+/* float32: bit index of MSB & LSB of sign, exponent and mantissa */
+#define FP32_LSB_M 0
+#define FP32_MSB_M 22
+#define FP32_LSB_E 23
+#define FP32_MSB_E 30
+#define FP32_LSB_S 31
+#define FP32_MSB_S 31
+
+/* float32: bitmask for sign, exponent and mantissa */
+#define FP32_MASK_S GENMASK_U32(FP32_MSB_S, FP32_LSB_S)
+#define FP32_MASK_E GENMASK_U32(FP32_MSB_E, FP32_LSB_E)
+#define FP32_MASK_M GENMASK_U32(FP32_MSB_M, FP32_LSB_M)
+
+/* float16: bit index of MSB & LSB of sign, exponent and mantissa */
+#define FP16_LSB_M 0
+#define FP16_MSB_M 9
+#define FP16_LSB_E 10
+#define FP16_MSB_E 14
+#define FP16_LSB_S 15
+#define FP16_MSB_S 15
+
+/* float16: bitmask for sign, exponent and mantissa */
+#define FP16_MASK_S GENMASK_U32(FP16_MSB_S, FP16_LSB_S)
+#define FP16_MASK_E GENMASK_U32(FP16_MSB_E, FP16_LSB_E)
+#define FP16_MASK_M GENMASK_U32(FP16_MSB_M, FP16_LSB_M)
+
+/* bfloat16: bit index of MSB & LSB of sign, exponent and mantissa */
+#define BF16_LSB_M 0
+#define BF16_MSB_M 6
+#define BF16_LSB_E 7
+#define BF16_MSB_E 14
+#define BF16_LSB_S 15
+#define BF16_MSB_S 15
+
+/* bfloat16: bitmask for sign, exponent and mantissa */
+#define BF16_MASK_S GENMASK_U32(BF16_MSB_S, BF16_LSB_S)
+#define BF16_MASK_E GENMASK_U32(BF16_MSB_E, BF16_LSB_E)
+#define BF16_MASK_M GENMASK_U32(BF16_MSB_M, BF16_LSB_M)
+
+/* Exponent bias */
+#define FP32_BIAS_E 127
+#define FP16_BIAS_E 15
+#define BF16_BIAS_E 127
+
+#define FP32_PACK(sign, exponent, mantissa) \
+ (((sign) << FP32_LSB_S) | ((exponent) << FP32_LSB_E) | (mantissa))
+
+#define FP16_PACK(sign, exponent, mantissa) \
+ (((sign) << FP16_LSB_S) | ((exponent) << FP16_LSB_E) | (mantissa))
+
+#define BF16_PACK(sign, exponent, mantissa) \
+ (((sign) << BF16_LSB_S) | ((exponent) << BF16_LSB_E) | (mantissa))
+
+/* Represent float32 as float and uint32_t */
+union float32 {
+ float f;
+ uint32_t u;
+};
diff --git a/lib/mldev/mldev_utils_scalar_bfloat16.c b/lib/mldev/mldev_utils_scalar_bfloat16.c
new file mode 100644
index 0000000000..1437416313
--- /dev/null
+++ b/lib/mldev/mldev_utils_scalar_bfloat16.c
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2023 Marvell.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "mldev_utils_scalar.h"
+
+/* Description:
+ * This file implements scalar versions of Machine Learning utility functions used to convert data
+ * types from bfloat16 to float32 and vice-versa.
+ */
+
+/* Convert a single precision floating point number (float32) into a
+ * brain float number (bfloat16) using round to nearest rounding mode.
+ */
+static uint16_t
+__float32_to_bfloat16_scalar_rtn(float x)
+{
+ union float32 f32; /* float32 input */
+ uint32_t f32_s; /* float32 sign */
+ uint32_t f32_e; /* float32 exponent */
+ uint32_t f32_m; /* float32 mantissa */
+ uint16_t b16_s; /* float16 sign */
+ uint16_t b16_e; /* float16 exponent */
+ uint16_t b16_m; /* float16 mantissa */
+ uint32_t tbits; /* number of truncated bits */
+ uint16_t u16; /* float16 output */
+
+ f32.f = x;
+ f32_s = (f32.u & FP32_MASK_S) >> FP32_LSB_S;
+ f32_e = (f32.u & FP32_MASK_E) >> FP32_LSB_E;
+ f32_m = (f32.u & FP32_MASK_M) >> FP32_LSB_M;
+
+ b16_s = f32_s;
+ b16_e = 0;
+ b16_m = 0;
+
+ switch (f32_e) {
+ case (0): /* float32: zero or subnormal number */
+ b16_e = 0;
+ if (f32_m == 0) /* zero */
+ b16_m = 0;
+ else /* subnormal float32 number, normal bfloat16 */
+ goto bf16_normal;
+ break;
+ case (FP32_MASK_E >> FP32_LSB_E): /* float32: infinity or nan */
+ b16_e = BF16_MASK_E >> BF16_LSB_E;
+ if (f32_m == 0) { /* infinity */
+ b16_m = 0;
+ } else { /* nan, propagate mantissa and set MSB of mantissa to 1 */
+ b16_m = f32_m >> (FP32_MSB_M - BF16_MSB_M);
+ b16_m |= BIT(BF16_MSB_M);
+ }
+ break;
+ default: /* float32: normal number, normal bfloat16 */
+ goto bf16_normal;
+ }
+
+ goto bf16_pack;
+
+bf16_normal:
+ b16_e = f32_e;
+ tbits = FP32_MSB_M - BF16_MSB_M;
+ b16_m = f32_m >> tbits;
+
+ /* if non-leading truncated bits are set */
+ if ((f32_m & GENMASK_U32(tbits - 1, 0)) > BIT(tbits - 1)) {
+ b16_m++;
+
+ /* if overflow into exponent */
+ if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
+ b16_e++;
+ } else if ((f32_m & GENMASK_U32(tbits - 1, 0)) == BIT(tbits - 1)) {
+ /* if only leading truncated bit is set */
+ if ((b16_m & 0x1) == 0x1) {
+ b16_m++;
+
+ /* if overflow into exponent */
+ if (((b16_m & BF16_MASK_E) >> BF16_LSB_E) == 0x1)
+ b16_e++;
+ }
+ }
+ b16_m = b16_m & BF16_MASK_M;
+
+bf16_pack:
+ u16 = BF16_PACK(b16_s, b16_e, b16_m);
+
+ return u16;
+}
+
+int
+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ uint16_t *output_buffer;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (uint16_t *)output;
+
+ for (i = 0; i < nb_elements; i++) {
+ *output_buffer = __float32_to_bfloat16_scalar_rtn(*input_buffer);
+
+ input_buffer = input_buffer + 1;
+ output_buffer = output_buffer + 1;
+ }
+
+ return 0;
+}
+
+/* Convert a brain float number (bfloat16) into a
+ * single precision floating point number (float32).
+ */
+static float
+__bfloat16_to_float32_scalar_rtx(uint16_t f16)
+{
+ union float32 f32; /* float32 output */
+ uint16_t b16_s; /* float16 sign */
+ uint16_t b16_e; /* float16 exponent */
+ uint16_t b16_m; /* float16 mantissa */
+ uint32_t f32_s; /* float32 sign */
+ uint32_t f32_e; /* float32 exponent */
+ uint32_t f32_m; /* float32 mantissa*/
+ uint8_t shift; /* number of bits to be shifted */
+
+ b16_s = (f16 & BF16_MASK_S) >> BF16_LSB_S;
+ b16_e = (f16 & BF16_MASK_E) >> BF16_LSB_E;
+ b16_m = (f16 & BF16_MASK_M) >> BF16_LSB_M;
+
+ f32_s = b16_s;
+ switch (b16_e) {
+ case (BF16_MASK_E >> BF16_LSB_E): /* bfloat16: infinity or nan */
+ f32_e = FP32_MASK_E >> FP32_LSB_E;
+ if (b16_m == 0x0) { /* infinity */
+ f32_m = 0;
+ } else { /* nan, propagate mantissa, set MSB of mantissa to 1 */
+ f32_m = b16_m;
+ shift = FP32_MSB_M - BF16_MSB_M;
+ f32_m = (f32_m << shift) & FP32_MASK_M;
+ f32_m |= BIT(FP32_MSB_M);
+ }
+ break;
+ case 0: /* bfloat16: zero or subnormal */
+ f32_m = b16_m;
+ if (b16_m == 0) { /* zero signed */
+ f32_e = 0;
+ } else { /* subnormal numbers */
+ goto fp32_normal;
+ }
+ break;
+ default: /* bfloat16: normal number */
+ goto fp32_normal;
+ }
+
+ goto fp32_pack;
+
+fp32_normal:
+ f32_m = b16_m;
+ f32_e = FP32_BIAS_E + b16_e - BF16_BIAS_E;
+
+ shift = (FP32_MSB_M - BF16_MSB_M);
+ f32_m = (f32_m << shift) & FP32_MASK_M;
+
+fp32_pack:
+ f32.u = FP32_PACK(f32_s, f32_e, f32_m);
+
+ return f32.f;
+}
+
+int
+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ uint16_t *input_buffer;
+ float *output_buffer;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (uint16_t *)input;
+ output_buffer = (float *)output;
+
+ for (i = 0; i < nb_elements; i++) {
+ *output_buffer = __bfloat16_to_float32_scalar_rtx(*input_buffer);
+
+ input_buffer = input_buffer + 1;
+ output_buffer = output_buffer + 1;
+ }
+
+ return 0;
+}
--
2.17.1
* Re: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-13 12:03 ` [PATCH v2 " Srikanth Yalavarthi
@ 2023-03-15 8:23 ` David Marchand
2023-03-15 10:02 ` Ruifeng Wang
1 sibling, 0 replies; 8+ messages in thread
From: David Marchand @ 2023-03-15 8:23 UTC (permalink / raw)
To: Ruifeng Wang, Jerin Jacob Kollanukkaran
Cc: dev, sshankarnara, Srikanth Yalavarthi, Honnappa Nagarahalli
Hello,
On Mon, Mar 13, 2023 at 1:03 PM Srikanth Yalavarthi
<syalavarthi@marvell.com> wrote:
>
> Since bfloat16 intrinsics are not supported on all ARM platforms
> that support NEON, bfloat16 routines are moved to separate files.
> This would enable using scalar implementation for bfloat16 on
> unsupported ARM platforms.
>
> Bugzilla ID: 1179
> Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
>
> Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
Please, can you review this patch?
Thanks.
--
David Marchand
* RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-13 12:03 ` [PATCH v2 " Srikanth Yalavarthi
2023-03-15 8:23 ` David Marchand
@ 2023-03-15 10:02 ` Ruifeng Wang
2023-03-15 10:41 ` Srikanth Yalavarthi
1 sibling, 1 reply; 8+ messages in thread
From: Ruifeng Wang @ 2023-03-15 10:02 UTC (permalink / raw)
To: Srikanth Yalavarthi; +Cc: dev, sshankarnara, david.marchand, nd
> -----Original Message-----
> From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> Sent: Monday, March 13, 2023 8:03 PM
> To: Srikanth Yalavarthi <syalavarthi@marvell.com>; Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; sshankarnara@marvell.com; david.marchand@redhat.com
> Subject: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
>
> Since bfloat16 intrinsics are not supported on all ARM platforms that support NEON,
> bfloat16 routines are moved to separate files.
> This would enable using scalar implementation for bfloat16 on unsupported ARM platforms.
>
> Bugzilla ID: 1179
> Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
>
> Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
> ---
> Depends-on: patch-120653 ("mldev: remove weak symbols use in type conversions")
> Depends-on: patch-125035 ("mldev: fix identical code in conditional branches")
>
> lib/mldev/meson.build | 11 +-
> lib/mldev/mldev_utils_neon.c | 142 +------------
> lib/mldev/mldev_utils_neon_bfloat16.c | 154 ++++++++++++++
> lib/mldev/mldev_utils_scalar.c | 262 +-----------------------
> lib/mldev/mldev_utils_scalar.h | 80 ++++++++
> lib/mldev/mldev_utils_scalar_bfloat16.c | 197 ++++++++++++++++++
> 6 files changed, 445 insertions(+), 401 deletions(-) create mode 100644
> lib/mldev/mldev_utils_neon_bfloat16.c
> create mode 100644 lib/mldev/mldev_utils_scalar.h create mode 100644
> lib/mldev/mldev_utils_scalar_bfloat16.c
>
> diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build index c9db42257b..5769b0640a
> 100644
> --- a/lib/mldev/meson.build
> +++ b/lib/mldev/meson.build
> @@ -7,12 +7,21 @@ sources = files(
> 'mldev_utils.c',
> )
>
> -if dpdk_conf.has('RTE_ARCH_ARM64')
> +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> + cc.get_define('__ARM_NEON', args: machine_args) != '')
I found in the ACLE document that "__ARM_NEON" is always set to 1 for AArch64.
So this line of the check is redundant?
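If that holds for all supported compilers, this first block could in principle be reduced to the
pre-patch architecture check alone. A minimal sketch, for illustration only (not part of the
submitted patch):

    if dpdk_conf.has('RTE_ARCH_ARM64')
        sources += files('mldev_utils_neon.c')
    else
        sources += files('mldev_utils_scalar.c')
    endif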
> sources += files('mldev_utils_neon.c') else
> sources += files('mldev_utils_scalar.c') endif
>
> +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> + cc.get_define('__ARM_NEON', args: machine_args) != '' and
Same here.
> + cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
> + sources += files('mldev_utils_neon_bfloat16.c')
> +else
> + sources += files('mldev_utils_scalar_bfloat16.c')
> +endif
> +
> headers = files(
> 'rte_mldev.h',
> )
<snip>
* RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-15 10:02 ` Ruifeng Wang
@ 2023-03-15 10:41 ` Srikanth Yalavarthi
2023-03-15 13:39 ` Srikanth Yalavarthi
0 siblings, 1 reply; 8+ messages in thread
From: Srikanth Yalavarthi @ 2023-03-15 10:41 UTC (permalink / raw)
To: Ruifeng Wang
Cc: dev, Shivah Shankar Shankar Narayan Rao, david.marchand, nd,
Srikanth Yalavarthi
> -----Original Message-----
> From: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Sent: 15 March 2023 15:32
> To: Srikanth Yalavarthi <syalavarthi@marvell.com>
> Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao
> <sshankarnara@marvell.com>; david.marchand@redhat.com; nd
> <nd@arm.com>
> Subject: [EXT] RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate
> files
>
> > -----Original Message-----
> > From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > Sent: Monday, March 13, 2023 8:03 PM
> > To: Srikanth Yalavarthi <syalavarthi@marvell.com>; Ruifeng Wang
> > <Ruifeng.Wang@arm.com>
> > Cc: dev@dpdk.org; sshankarnara@marvell.com;
> david.marchand@redhat.com
> > Subject: [PATCH v2 1/1] mldev: split bfloat16 routines to separate
> > files
> >
> > Since bfloat16 intrinsics are not supported on all ARM platforms that
> > support NEON,
> > bfloat16 routines are moved to separate files.
> > This would enable using scalar implementation for bfloat16 on unsupported
> ARM platforms.
> >
> > Bugzilla ID: 1179
> > Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
> >
> > Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > ---
> > Depends-on: patch-120653 ("mldev: remove weak symbols use in type
> > conversions")
> > Depends-on: patch-125035 ("mldev: fix identical code in conditional
> > branches")
> >
> > lib/mldev/meson.build | 11 +-
> > lib/mldev/mldev_utils_neon.c | 142 +------------
> > lib/mldev/mldev_utils_neon_bfloat16.c | 154 ++++++++++++++
> > lib/mldev/mldev_utils_scalar.c | 262 +-----------------------
> > lib/mldev/mldev_utils_scalar.h | 80 ++++++++
> > lib/mldev/mldev_utils_scalar_bfloat16.c | 197 ++++++++++++++++++
> > 6 files changed, 445 insertions(+), 401 deletions(-)
> > create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c
> > create mode 100644 lib/mldev/mldev_utils_scalar.h
> > create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c
> >
> > diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
> > index c9db42257b..5769b0640a 100644
> > --- a/lib/mldev/meson.build
> > +++ b/lib/mldev/meson.build
> > @@ -7,12 +7,21 @@ sources = files(
> > 'mldev_utils.c',
> > )
> >
> > -if dpdk_conf.has('RTE_ARCH_ARM64')
> > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > + cc.get_define('__ARM_NEON', args: machine_args) != '')
>
> I found in the ACLE document that "__ARM_NEON" is always set to 1 for
> AArch64.
> So this check is redundant?
Checking for __ARM_NEON should be enough.
We can drop the dpdk_conf.has('RTE_ARCH_ARM64') check.
I will test the builds and submit a revised patch.
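Something along these lines is what I plan to try; this is a rough, untested sketch that assumes
the __ARM_NEON define alone is a reliable indicator on the toolchains we support:

    if cc.get_define('__ARM_NEON', args: machine_args) != ''
        sources += files('mldev_utils_neon.c')
    else
        sources += files('mldev_utils_scalar.c')
    endif

    if (cc.get_define('__ARM_NEON', args: machine_args) != '' and
            cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
        sources += files('mldev_utils_neon_bfloat16.c')
    else
        sources += files('mldev_utils_scalar_bfloat16.c')
    endif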
>
> > sources += files('mldev_utils_neon.c')
> > else
> > sources += files('mldev_utils_scalar.c')
> > endif
> >
> > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > + cc.get_define('__ARM_NEON', args: machine_args) != '' and
>
> Same here.
>
> > + cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
> > + sources += files('mldev_utils_neon_bfloat16.c')
> > +else
> > + sources += files('mldev_utils_scalar_bfloat16.c')
> > +endif
> > +
> > headers = files(
> > 'rte_mldev.h',
> > )
> <snip>
* RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-15 10:41 ` Srikanth Yalavarthi
@ 2023-03-15 13:39 ` Srikanth Yalavarthi
2023-03-16 2:42 ` Ruifeng Wang
0 siblings, 1 reply; 8+ messages in thread
From: Srikanth Yalavarthi @ 2023-03-15 13:39 UTC (permalink / raw)
To: Ruifeng Wang
Cc: dev, Shivah Shankar Shankar Narayan Rao, david.marchand, nd,
Srikanth Yalavarthi
> -----Original Message-----
> From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> Sent: 15 March 2023 16:12
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao
> <sshankarnara@marvell.com>; david.marchand@redhat.com; nd
> <nd@arm.com>; Srikanth Yalavarthi <syalavarthi@marvell.com>; Srikanth
> Yalavarthi <syalavarthi@marvell.com>
> Subject: RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
>
> > -----Original Message-----
> > From: Ruifeng Wang <Ruifeng.Wang@arm.com>
> > Sent: 15 March 2023 15:32
> > To: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao
> > <sshankarnara@marvell.com>; david.marchand@redhat.com; nd
> <nd@arm.com>
> > Subject: [EXT] RE: [PATCH v2 1/1] mldev: split bfloat16 routines to
> > separate files
> >
> > > -----Original Message-----
> > > From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > > Sent: Monday, March 13, 2023 8:03 PM
> > > To: Srikanth Yalavarthi <syalavarthi@marvell.com>; Ruifeng Wang
> > > <Ruifeng.Wang@arm.com>
> > > Cc: dev@dpdk.org; sshankarnara@marvell.com;
> > david.marchand@redhat.com
> > > Subject: [PATCH v2 1/1] mldev: split bfloat16 routines to separate
> > > files
> > >
> > > Since bfloat16 intrinsics are not supported on all ARM platforms
> > > that support NEON,
> > > bfloat16 routines are moved to separate files.
> > > This would enable using scalar implementation for bfloat16 on
> > > unsupported
> > ARM platforms.
> > >
> > > Bugzilla ID: 1179
> > > Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
> > >
> > > Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > > ---
> > > Depends-on: patch-120653 ("mldev: remove weak symbols use in type
> > > conversions")
> > > Depends-on: patch-125035 ("mldev: fix identical code in conditional
> > > branches")
> > >
> > > lib/mldev/meson.build | 11 +-
> > > lib/mldev/mldev_utils_neon.c | 142 +------------
> > > lib/mldev/mldev_utils_neon_bfloat16.c | 154 ++++++++++++++
> > > lib/mldev/mldev_utils_scalar.c | 262 +-----------------------
> > > lib/mldev/mldev_utils_scalar.h | 80 ++++++++
> > > lib/mldev/mldev_utils_scalar_bfloat16.c | 197 ++++++++++++++++++
> > > 6 files changed, 445 insertions(+), 401 deletions(-)
> > > create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c
> > > create mode 100644 lib/mldev/mldev_utils_scalar.h
> > > create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c
> > >
> > > diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
> > > index c9db42257b..5769b0640a 100644
> > > --- a/lib/mldev/meson.build
> > > +++ b/lib/mldev/meson.build
> > > @@ -7,12 +7,21 @@ sources = files(
> > > 'mldev_utils.c',
> > > )
> > >
> > > -if dpdk_conf.has('RTE_ARCH_ARM64')
> > > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > > + cc.get_define('__ARM_NEON', args: machine_args) != '')
> >
> > I found in the ACLE document that "__ARM_NEON" is always set to 1 for
> > AArch64.
> > So this check is redundant?
>
> Checking for __ARM_NEON should be enough.
> We can drop the dpdk_conf.has('RTE_ARCH_ARM64') check.
> I will test the builds and submit a revised patch.
>
Correction. Ideally, checking for RTE_ARCH_ARM64 alone would be enough.
But the __ARM_NEON check is required when building with gcc-4.8.x. I have tested this on CentOS-7 with GCC-4.8.5.
Refer to https://bugs.dpdk.org/show_bug.cgi?id=1179
The errors below are reported with GCC-4.8 when the __ARM_NEON check is not used:
../lib/mldev/mldev_utils_neon.c:220:2: warning: nested extern declaration of 'vcvtas_u32_f32' [-Wnested-externs]
../lib/mldev/mldev_utils_neon.c: In function '__uint8_to_float32_neon_f32x1':
../lib/mldev/mldev_utils_neon.c:297:2: warning: implicit declaration of function 'vcvts_f32_u32' [-Wimplicit-function-declaration]
*output = scale * vcvts_f32_u32((uint32_t)*input);
^
../lib/mldev/mldev_utils_neon.c:297:2: warning: nested extern declaration of 'vcvts_f32_u32' [-Wnested-externs]
../lib/mldev/mldev_utils_neon.c: At top level:
../lib/mldev/mldev_utils_neon.c:604:51: error: unknown type name 'float16_t'
__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
^
So, we will need both checks.
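As an aside, if we wanted to guard against toolchain quirks like this more directly, a compile
probe could be used instead of (or alongside) the macro checks. A rough, untested sketch, not
part of this patch, assuming a meson cc.compiles() check is acceptable here:

    # Hypothetical probe: check that the toolchain actually provides the
    # NEON float16_t type before selecting the vector implementation.
    neon_f16_probe = '''
    #include <arm_neon.h>
    int main(void) { float16_t v = (float16_t)0; return (int)v; }
    '''
    if (dpdk_conf.has('RTE_ARCH_ARM64') and
            cc.compiles(neon_f16_probe, args: machine_args, name: 'NEON float16_t support'))
        sources += files('mldev_utils_neon.c')
    else
        sources += files('mldev_utils_scalar.c')
    endif

For now the explicit define checks in the patch seem simpler, so I will keep both.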
> >
> > > sources += files('mldev_utils_neon.c')
> > > else
> > > sources += files('mldev_utils_scalar.c')
> > > endif
> > >
> > > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > > + cc.get_define('__ARM_NEON', args: machine_args) != '' and
> >
> > Same here.
> >
> > > + cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
> > > + sources += files('mldev_utils_neon_bfloat16.c')
> > > +else
> > > + sources += files('mldev_utils_scalar_bfloat16.c')
> > > +endif
> > > +
> > > headers = files(
> > > 'rte_mldev.h',
> > > )
> > <snip>
* RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-15 13:39 ` Srikanth Yalavarthi
@ 2023-03-16 2:42 ` Ruifeng Wang
2023-03-16 12:27 ` David Marchand
0 siblings, 1 reply; 8+ messages in thread
From: Ruifeng Wang @ 2023-03-16 2:42 UTC (permalink / raw)
To: Srikanth Yalavarthi
Cc: dev, Shivah Shankar Shankar Narayan Rao, david.marchand, nd, nd
> -----Original Message-----
> From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> Sent: Wednesday, March 15, 2023 9:39 PM
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao <sshankarnara@marvell.com>;
> david.marchand@redhat.com; nd <nd@arm.com>; Srikanth Yalavarthi <syalavarthi@marvell.com>
> Subject: RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
>
> > -----Original Message-----
> > From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > Sent: 15 March 2023 16:12
> > To: Ruifeng Wang <Ruifeng.Wang@arm.com>
> > Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao
> > <sshankarnara@marvell.com>; david.marchand@redhat.com; nd
> > <nd@arm.com>; Srikanth Yalavarthi <syalavarthi@marvell.com>; Srikanth
> > Yalavarthi <syalavarthi@marvell.com>
> > Subject: RE: [PATCH v2 1/1] mldev: split bfloat16 routines to separate
> > files
> >
> > > -----Original Message-----
> > > From: Ruifeng Wang <Ruifeng.Wang@arm.com>
> > > Sent: 15 March 2023 15:32
> > > To: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > > Cc: dev@dpdk.org; Shivah Shankar Shankar Narayan Rao
> > > <sshankarnara@marvell.com>; david.marchand@redhat.com; nd
> > <nd@arm.com>
> > > Subject: [EXT] RE: [PATCH v2 1/1] mldev: split bfloat16 routines to
> > > separate files
> > >
> > > > -----Original Message-----
> > > > From: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > > > Sent: Monday, March 13, 2023 8:03 PM
> > > > To: Srikanth Yalavarthi <syalavarthi@marvell.com>; Ruifeng Wang
> > > > <Ruifeng.Wang@arm.com>
> > > > Cc: dev@dpdk.org; sshankarnara@marvell.com;
> > > david.marchand@redhat.com
> > > > Subject: [PATCH v2 1/1] mldev: split bfloat16 routines to separate
> > > > files
> > > >
> > > > Since bfloat16 intrinsics are not supported on all ARM platforms
> > > > that support NEON,
> > > > bfloat16 routines are moved to separate files.
> > > > This would enable using scalar implementation for bfloat16 on
> > > > unsupported
> > > ARM platforms.
> > > >
> > > > Bugzilla ID: 1179
> > > > Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
> > > >
> > > > Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
> > > > ---
> > > > Depends-on: patch-120653 ("mldev: remove weak symbols use in type
> > > > conversions")
> > > > Depends-on: patch-125035 ("mldev: fix identical code in
> > > > conditional
> > > > branches")
> > > >
> > > > lib/mldev/meson.build | 11 +-
> > > > lib/mldev/mldev_utils_neon.c | 142 +------------
> > > > lib/mldev/mldev_utils_neon_bfloat16.c | 154 ++++++++++++++
> > > > lib/mldev/mldev_utils_scalar.c | 262 +-----------------------
> > > > lib/mldev/mldev_utils_scalar.h | 80 ++++++++
> > > > lib/mldev/mldev_utils_scalar_bfloat16.c | 197 ++++++++++++++++++
> > > > 6 files changed, 445 insertions(+), 401 deletions(-)
> > > > create mode 100644 lib/mldev/mldev_utils_neon_bfloat16.c
> > > > create mode 100644 lib/mldev/mldev_utils_scalar.h
> > > > create mode 100644 lib/mldev/mldev_utils_scalar_bfloat16.c
> > > >
> > > > diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
> > > > index c9db42257b..5769b0640a 100644
> > > > --- a/lib/mldev/meson.build
> > > > +++ b/lib/mldev/meson.build
> > > > @@ -7,12 +7,21 @@ sources = files(
> > > > 'mldev_utils.c',
> > > > )
> > > >
> > > > -if dpdk_conf.has('RTE_ARCH_ARM64')
> > > > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > > > + cc.get_define('__ARM_NEON', args: machine_args) != '')
> > >
> > > I found in the ACLE document that "__ARM_NEON" is always set to 1 for
> > > AArch64.
> > > So this check is redundant?
> >
> > Checking for __ARM_NEON should be enough.
> > We can drop the dpdk_conf.has('RTE_ARCH_ARM64') check.
> > I will test the builds and submit a revised patch.
> >
>
> Correction. Ideally, checking for RTE_ARCH_ARM64 alone would be enough.
>
> But the __ARM_NEON check is required when building with gcc-4.8.x. I have tested this on
> CentOS-7 with GCC-4.8.5. Refer to https://bugs.dpdk.org/show_bug.cgi?id=1179
>
> The errors below are reported with GCC-4.8 when the __ARM_NEON check is not used:
Thanks for the clarification. So some NEON intrinsics and types are not supported by earlier GCC versions.
Acked-by: Ruifeng Wang <ruifeng.wang@arm.com>
>
> ../lib/mldev/mldev_utils_neon.c:220:2: warning: nested extern declaration of
> 'vcvtas_u32_f32' [-Wnested-externs]
> ../lib/mldev/mldev_utils_neon.c: In function '__uint8_to_float32_neon_f32x1':
> ../lib/mldev/mldev_utils_neon.c:297:2: warning: implicit declaration of function
> 'vcvts_f32_u32' [-Wimplicit-function-declaration]
> *output = scale * vcvts_f32_u32((uint32_t)*input);
> ^
> ../lib/mldev/mldev_utils_neon.c:297:2: warning: nested extern declaration of
> 'vcvts_f32_u32' [-Wnested-externs]
> ../lib/mldev/mldev_utils_neon.c: At top level:
> ../lib/mldev/mldev_utils_neon.c:604:51: error: unknown type name 'float16_t'
> __float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
> ^
>
> So, we will need both checks.
>
>
> > >
> > > > sources += files('mldev_utils_neon.c')
> > > > else
> > > > sources += files('mldev_utils_scalar.c')
> > > > endif
> > > >
> > > > +if (dpdk_conf.has('RTE_ARCH_ARM64') and
> > > > + cc.get_define('__ARM_NEON', args: machine_args) != '' and
> > >
> > > Same here.
> > >
> > > > + cc.get_define('__ARM_FEATURE_BF16', args: machine_args) != '')
> > > > + sources += files('mldev_utils_neon_bfloat16.c')
> > > > +else
> > > > + sources += files('mldev_utils_scalar_bfloat16.c')
> > > > +endif
> > > > +
> > > > headers = files(
> > > > 'rte_mldev.h',
> > > > )
> > > <snip>
* Re: [PATCH v2 1/1] mldev: split bfloat16 routines to separate files
2023-03-16 2:42 ` Ruifeng Wang
@ 2023-03-16 12:27 ` David Marchand
0 siblings, 0 replies; 8+ messages in thread
From: David Marchand @ 2023-03-16 12:27 UTC (permalink / raw)
To: Srikanth Yalavarthi
Cc: dev, Shivah Shankar Shankar Narayan Rao, nd, Ruifeng Wang
On Thu, Mar 16, 2023 at 3:43 AM Ruifeng Wang <Ruifeng.Wang@arm.com> wrote:
> > > > > Since bfloat16 intrinsics are not supported on all ARM platforms
> > > > > that support NEON,
> > > > > bfloat16 routines are moved to separate files.
> > > > > This would enable using scalar implementation for bfloat16 on
> > > > > unsupported
> > > > ARM platforms.
> > > > >
> > > > > Bugzilla ID: 1179
> > > > > Fixes: fc54766b1612 ("mldev: add Arm NEON type conversion")
> > > > >
> > > > > Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
> Acked-by: Ruifeng Wang <ruifeng.wang@arm.com>
Applied, thanks.
--
David Marchand
end of thread
Thread overview: 8+ messages
2023-03-13 11:43 [PATCH 1/1] mldev: split bfloat16 routines to separate files Srikanth Yalavarthi
2023-03-13 12:03 ` [PATCH v2 " Srikanth Yalavarthi
2023-03-15 8:23 ` David Marchand
2023-03-15 10:02 ` Ruifeng Wang
2023-03-15 10:41 ` Srikanth Yalavarthi
2023-03-15 13:39 ` Srikanth Yalavarthi
2023-03-16 2:42 ` Ruifeng Wang
2023-03-16 12:27 ` David Marchand