From: Srikanth Yalavarthi <syalavarthi@marvell.com>
To: Srikanth Yalavarthi <syalavarthi@marvell.com>,
Ruifeng Wang <ruifeng.wang@arm.com>
Cc: <dev@dpdk.org>, <sshankarnara@marvell.com>, <jerinj@marvell.com>,
<aprabhu@marvell.com>, <ptakkar@marvell.com>,
<pshukla@marvell.com>
Subject: [PATCH v6 4/4] mldev: add Arm NEON type conversion routines
Date: Tue, 7 Feb 2023 08:00:08 -0800 [thread overview]
Message-ID: <20230207160008.30182-5-syalavarthi@marvell.com> (raw)
In-Reply-To: <20230207160008.30182-1-syalavarthi@marvell.com>
Added ARM NEON intrinsic based implementations to support conversion
of data types. Support is enabled to handle int8, uint8, int16, uint16,
float16, float32 and bfloat16 types.
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
v5:
* Moved the code from drivers/common/ml to lib/mldev
* Added rte_ml_io_ prefix to the functions
v2:
* Dropped use of driver routines to call neon functions
* Optimization of neon functions. Reduce the number of intrinsic calls.
lib/mldev/meson.build | 4 +
lib/mldev/mldev_utils_neon.c | 873 +++++++++++++++++++++++++++++++++++
2 files changed, 877 insertions(+)
create mode 100644 lib/mldev/mldev_utils_neon.c
diff --git a/lib/mldev/meson.build b/lib/mldev/meson.build
index fce9c0ebee..05694b0839 100644
--- a/lib/mldev/meson.build
+++ b/lib/mldev/meson.build
@@ -8,6 +8,10 @@ sources = files(
'mldev_utils_scalar.c',
)
+if arch_subdir == 'arm'
+ sources += files('mldev_utils_neon.c')
+endif
+
headers = files(
'rte_mldev.h',
)
diff --git a/lib/mldev/mldev_utils_neon.c b/lib/mldev/mldev_utils_neon.c
new file mode 100644
index 0000000000..32b620db20
--- /dev/null
+++ b/lib/mldev/mldev_utils_neon.c
@@ -0,0 +1,873 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2022 Marvell.
+ */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "mldev_utils.h"
+
+#include <arm_neon.h>
+
+/* Description:
+ * This file implements vector versions of Machine Learning utility functions used to convert data
+ * types from higher precision to lower precision and vice-versa. Implementation is based on Arm
+ * Neon intrinsics.
+ */
+
+static inline void
+__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
+{
+ int16x4_t s16x4_l;
+ int16x4_t s16x4_h;
+ float32x4_t f32x4;
+ int16x8_t s16x8;
+ int32x4_t s32x4;
+ int8x8_t s8x8;
+
+ /* load 4 float32 elements, scale, convert, saturate narrow to int16.
+ * Use round to nearest with ties away rounding mode.
+ */
+ f32x4 = vld1q_f32(input);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ s32x4 = vcvtaq_s32_f32(f32x4);
+ s16x4_l = vqmovn_s32(s32x4);
+
+ /* load next 4 float32 elements, scale, convert, saturate narrow to int16.
+ * Use round to nearest with ties away rounding mode.
+ */
+ f32x4 = vld1q_f32(input + 4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ s32x4 = vcvtaq_s32_f32(f32x4);
+ s16x4_h = vqmovn_s32(s32x4);
+
+ /* combine lower and higher int16x4_t to int16x8_t */
+ s16x8 = vcombine_s16(s16x4_l, s16x4_h);
+
+ /* narrow to int8_t */
+ s8x8 = vqmovn_s16(s16x8);
+
+ /* store 8 elements */
+ vst1_s8(output, s8x8);
+}
+
+static inline void
+__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
+{
+ int32_t s32;
+ int16_t s16;
+
+ /* scale and convert, round to nearest with ties away rounding mode */
+ s32 = vcvtas_s32_f32(scale * (*input));
+
+ /* saturate narrow */
+ s16 = vqmovns_s32(s32);
+
+ /* convert to int8_t */
+ *output = vqmovnh_s16(s16);
+}
+
+int
+rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ int8_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (int8_t *)output;
+ vlen = 2 * sizeof(float) / sizeof(int8_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
+{
+ float32x4_t f32x4;
+ int16x8_t s16x8;
+ int16x4_t s16x4;
+ int32x4_t s32x4;
+ int8x8_t s8x8;
+
+ /* load 8 x int8_t elements */
+ s8x8 = vld1_s8(input);
+
+ /* widen int8_t to int16_t */
+ s16x8 = vmovl_s8(s8x8);
+
+ /* convert lower 4 elements: widen to int32_t, convert to float, scale and store */
+ s16x4 = vget_low_s16(s16x8);
+ s32x4 = vmovl_s16(s16x4);
+ f32x4 = vcvtq_f32_s32(s32x4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ vst1q_f32(output, f32x4);
+
+ /* convert higher 4 elements: widen to int32_t, convert to float, scale and store */
+ s16x4 = vget_high_s16(s16x8);
+ s32x4 = vmovl_s16(s16x4);
+ f32x4 = vcvtq_f32_s32(s32x4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ vst1q_f32(output + 4, f32x4);
+}
+
+static inline void
+__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
+{
+ *output = scale * vcvts_f32_s32((int32_t)*input);
+}
+
+int
+rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ int8_t *input_buffer;
+ float *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (int8_t *)input;
+ output_buffer = (float *)output;
+ vlen = 2 * sizeof(float) / sizeof(int8_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __int8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __int8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
+{
+ uint16x4_t u16x4_l;
+ uint16x4_t u16x4_h;
+ float32x4_t f32x4;
+ uint32x4_t u32x4;
+ uint16x8_t u16x8;
+ uint8x8_t u8x8;
+
+ /* load 4 float elements, scale, convert, saturate narrow to uint16_t.
+ * use round to nearest with ties away rounding mode.
+ */
+ f32x4 = vld1q_f32(input);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ u32x4 = vcvtaq_u32_f32(f32x4);
+ u16x4_l = vqmovn_u32(u32x4);
+
+ /* load next 4 float elements, scale, convert, saturate narrow to uint16_t
+ * use round to nearest with ties away rounding mode.
+ */
+ f32x4 = vld1q_f32(input + 4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ u32x4 = vcvtaq_u32_f32(f32x4);
+ u16x4_h = vqmovn_u32(u32x4);
+
+ /* combine lower and higher uint16x4_t */
+ u16x8 = vcombine_u16(u16x4_l, u16x4_h);
+
+ /* narrow to uint8x8_t */
+ u8x8 = vqmovn_u16(u16x8);
+
+ /* store 8 elements */
+ vst1_u8(output, u8x8);
+}
+
+static inline void
+__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
+{
+ uint32_t u32;
+ uint16_t u16;
+
+ /* scale and convert, round to nearest with ties away rounding mode */
+ u32 = vcvtas_u32_f32(scale * (*input));
+
+ /* saturate narrow */
+ u16 = vqmovns_u32(u32);
+
+ /* convert to uint8_t */
+ *output = vqmovnh_u16(u16);
+}
+
+int
+rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ uint8_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (uint8_t *)output;
+ vlen = 2 * sizeof(float) / sizeof(uint8_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_uint8_neon_u8x8(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_uint8_neon_u8x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
+{
+ float32x4_t f32x4;
+ uint16x8_t u16x8;
+ uint16x4_t u16x4;
+ uint32x4_t u32x4;
+ uint8x8_t u8x8;
+
+ /* load 8 x uint8_t elements */
+ u8x8 = vld1_u8(input);
+
+ /* widen uint8_t to uint16_t */
+ u16x8 = vmovl_u8(u8x8);
+
+ /* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
+ u16x4 = vget_low_u16(u16x8);
+ u32x4 = vmovl_u16(u16x4);
+ f32x4 = vcvtq_f32_u32(u32x4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ vst1q_f32(output, f32x4);
+
+ /* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
+ u16x4 = vget_high_u16(u16x8);
+ u32x4 = vmovl_u16(u16x4);
+ f32x4 = vcvtq_f32_u32(u32x4);
+ f32x4 = vmulq_n_f32(f32x4, scale);
+ vst1q_f32(output + 4, f32x4);
+}
+
+static inline void
+__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
+{
+ *output = scale * vcvts_f32_u32((uint32_t)*input);
+}
+
+int
+rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ uint8_t *input_buffer;
+ float *output_buffer;
+ uint64_t nb_iterations;
+ uint64_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (uint8_t *)input;
+ output_buffer = (float *)output;
+ vlen = 2 * sizeof(float) / sizeof(uint8_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __uint8_to_float32_neon_f32x8(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __uint8_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
+{
+ float32x4_t f32x4;
+ int16x4_t s16x4;
+ int32x4_t s32x4;
+
+ /* load 4 x float elements */
+ f32x4 = vld1q_f32(input);
+
+ /* scale */
+ f32x4 = vmulq_n_f32(f32x4, scale);
+
+ /* convert to int32x4_t using round to nearest with ties away rounding mode */
+ s32x4 = vcvtaq_s32_f32(f32x4);
+
+ /* saturate narrow to int16x4_t */
+ s16x4 = vqmovn_s32(s32x4);
+
+ /* store 4 elements */
+ vst1_s16(output, s16x4);
+}
+
+static inline void
+__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
+{
+ int32_t s32;
+
+ /* scale and convert, round to nearest with ties away rounding mode */
+ s32 = vcvtas_s32_f32(scale * (*input));
+
+ /* saturate narrow */
+ *output = vqmovns_s32(s32);
+}
+
+int
+rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ int16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (int16_t *)output;
+ vlen = 2 * sizeof(float) / sizeof(int16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_int16_neon_s16x4(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_int16_neon_s16x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
+{
+ float32x4_t f32x4;
+ int16x4_t s16x4;
+ int32x4_t s32x4;
+
+ /* load 4 x int16_t elements */
+ s16x4 = vld1_s16(input);
+
+ /* widen int16_t to int32_t */
+ s32x4 = vmovl_s16(s16x4);
+
+ /* convert int32_t to float */
+ f32x4 = vcvtq_f32_s32(s32x4);
+
+ /* scale */
+ f32x4 = vmulq_n_f32(f32x4, scale);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
+{
+ *output = scale * vcvts_f32_s32((int32_t)*input);
+}
+
+int
+rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ int16_t *input_buffer;
+ float *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (int16_t *)input;
+ output_buffer = (float *)output;
+ vlen = 2 * sizeof(float) / sizeof(int16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __int16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __int16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
+{
+ float32x4_t f32x4;
+ uint16x4_t u16x4;
+ uint32x4_t u32x4;
+
+ /* load 4 float elements */
+ f32x4 = vld1q_f32(input);
+
+ /* scale */
+ f32x4 = vmulq_n_f32(f32x4, scale);
+
+ /* convert using round to nearest with ties to away rounding mode */
+ u32x4 = vcvtaq_u32_f32(f32x4);
+
+ /* saturate narrow */
+ u16x4 = vqmovn_u32(u32x4);
+
+ /* store 4 elements */
+ vst1_u16(output, u16x4);
+}
+
+static inline void
+__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
+{
+ uint32_t u32;
+
+ /* scale and convert, round to nearest with ties away rounding mode */
+ u32 = vcvtas_u32_f32(scale * (*input));
+
+ /* saturate narrow */
+ *output = vqmovns_u32(u32);
+}
+
+int
+rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ float *input_buffer;
+ uint16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint64_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float *)input;
+ output_buffer = (uint16_t *)output;
+ vlen = 2 * sizeof(float) / sizeof(uint16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_uint16_neon_u16x4(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_uint16_neon_u16x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
+{
+ float32x4_t f32x4;
+ uint16x4_t u16x4;
+ uint32x4_t u32x4;
+
+ /* load 4 x uint16_t elements */
+ u16x4 = vld1_u16(input);
+
+ /* widen uint16_t to uint32_t */
+ u32x4 = vmovl_u16(u16x4);
+
+ /* convert uint32_t to float */
+ f32x4 = vcvtq_f32_u32(u32x4);
+
+ /* scale */
+ f32x4 = vmulq_n_f32(f32x4, scale);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
+{
+ *output = scale * vcvts_f32_u32((uint32_t)*input);
+}
+
+int
+rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+{
+ uint16_t *input_buffer;
+ float *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (uint16_t *)input;
+ output_buffer = (float *)output;
+ vlen = 2 * sizeof(float) / sizeof(uint16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __uint16_to_float32_neon_f32x4(scale, input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __uint16_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
+{
+ float32x4_t f32x4;
+ float16x4_t f16x4;
+
+ /* load 4 x float32_t elements */
+ f32x4 = vld1q_f32(input);
+
+ /* convert to float16x4_t */
+ f16x4 = vcvt_f16_f32(f32x4);
+
+ /* store float16x4_t */
+ vst1_f16(output, f16x4);
+}
+
+static inline void
+__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
+{
+ float32x4_t f32x4;
+ float16x4_t f16x4;
+
+ /* load element to 4 lanes */
+ f32x4 = vld1q_dup_f32(input);
+
+ /* convert float32_t to float16_t */
+ f16x4 = vcvt_f16_f32(f32x4);
+
+ /* store lane 0 / 1 element */
+ vst1_lane_f16(output, f16x4, 0);
+}
+
+int
+rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
+{
+ float32_t *input_buffer;
+ float16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float32_t *)input;
+ output_buffer = (float16_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_float16_neon_f16x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_float16_neon_f16x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
+{
+ float16x4_t f16x4;
+ float32x4_t f32x4;
+
+ /* load 4 x float16_t elements */
+ f16x4 = vld1_f16(input);
+
+ /* convert float16x4_t to float32x4_t */
+ f32x4 = vcvt_f32_f16(f16x4);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
+{
+ float16x4_t f16x4;
+ float32x4_t f32x4;
+
+ /* load element to 4 lanes */
+ f16x4 = vld1_dup_f16(input);
+
+ /* convert float16_t to float32_t */
+ f32x4 = vcvt_f32_f16(f16x4);
+
+ /* store 1 element */
+ vst1q_lane_f32(output, f32x4, 0);
+}
+
+int
+rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ float16_t *input_buffer;
+ float32_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float16_t *)input;
+ output_buffer = (float32_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float16_to_float32_neon_f32x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float16_to_float32_neon_f32x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+#ifdef __ARM_FEATURE_BF16
+
+static inline void
+__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load 4 x float32_t elements */
+ f32x4 = vld1q_f32(input);
+
+ /* convert float32x4_t to bfloat16x4_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store bfloat16x4_t */
+ vst1_bf16(output, bf16x4);
+}
+
+static inline void
+__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
+{
+ float32x4_t f32x4;
+ bfloat16x4_t bf16x4;
+
+ /* load element to 4 lanes */
+ f32x4 = vld1q_dup_f32(input);
+
+ /* convert float32_t to bfloat16_t */
+ bf16x4 = vcvt_bf16_f32(f32x4);
+
+ /* store lane 0 / 1 element */
+ vst1_lane_bf16(output, bf16x4, 0);
+}
+
+int
+rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+{
+ float32_t *input_buffer;
+ bfloat16_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (float32_t *)input;
+ output_buffer = (bfloat16_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __float32_to_bfloat16_neon_f16x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __float32_to_bfloat16_neon_f16x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load 4 x bfloat16_t elements */
+ bf16x4 = vld1_bf16(input);
+
+ /* convert bfloat16x4_t to float32x4_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store float32x4_t */
+ vst1q_f32(output, f32x4);
+}
+
+static inline void
+__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
+{
+ bfloat16x4_t bf16x4;
+ float32x4_t f32x4;
+
+ /* load element to 4 lanes */
+ bf16x4 = vld1_dup_bf16(input);
+
+ /* convert bfloat16_t to float32_t */
+ f32x4 = vcvt_f32_bf16(bf16x4);
+
+ /* store lane 0 / 1 element */
+ vst1q_lane_f32(output, f32x4, 0);
+}
+
+int
+rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+{
+ bfloat16_t *input_buffer;
+ float32_t *output_buffer;
+ uint64_t nb_iterations;
+ uint32_t vlen;
+ uint64_t i;
+
+ if ((nb_elements == 0) || (input == NULL) || (output == NULL))
+ return -EINVAL;
+
+ input_buffer = (bfloat16_t *)input;
+ output_buffer = (float32_t *)output;
+ vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
+ nb_iterations = nb_elements / vlen;
+
+ /* convert vlen elements in each iteration */
+ for (i = 0; i < nb_iterations; i++) {
+ __bfloat16_to_float32_neon_f32x4(input_buffer, output_buffer);
+ input_buffer += vlen;
+ output_buffer += vlen;
+ }
+
+ /* convert leftover elements */
+ i = i * vlen;
+ for (; i < nb_elements; i++) {
+ __bfloat16_to_float32_neon_f32x1(input_buffer, output_buffer);
+ input_buffer++;
+ output_buffer++;
+ }
+
+ return 0;
+}
+
+#endif /* __ARM_FEATURE_BF16 */
--
2.17.1
next prev parent reply other threads:[~2023-02-07 16:00 UTC|newest]
Thread overview: 59+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-12-08 19:35 [PATCH v1 0/4] implementation of ML common code Srikanth Yalavarthi
2022-12-08 19:35 ` [PATCH v1 1/4] common/ml: add initial files for " Srikanth Yalavarthi
2022-12-08 19:35 ` [PATCH v1 2/4] common/ml: add data type conversion routines Srikanth Yalavarthi
2022-12-08 19:35 ` [PATCH v1 3/4] common/ml: add generic type conversion functions Srikanth Yalavarthi
2022-12-08 19:35 ` [PATCH v1 4/4] common/ml: add Arm NEON type conversion routines Srikanth Yalavarthi
2022-12-12 7:16 ` Ruifeng Wang
2022-12-12 17:25 ` Srikanth Yalavarthi
2022-12-12 17:21 ` [PATCH v1 0/4] implementation of ML common code Srikanth Yalavarthi
2022-12-12 17:21 ` [PATCH v2 1/4] common/ml: add initial files for " Srikanth Yalavarthi
2022-12-12 17:21 ` [PATCH v2 2/4] common/ml: add common utility functions Srikanth Yalavarthi
2022-12-12 17:21 ` [PATCH v2 3/4] common/ml: add scalar type conversion functions Srikanth Yalavarthi
2022-12-12 17:21 ` [PATCH v2 4/4] common/ml: add Arm NEON type conversion routines Srikanth Yalavarthi
2022-12-13 9:04 ` Ruifeng Wang
2022-12-20 17:52 ` [PATCH v3 0/4] implementation of ML common code Srikanth Yalavarthi
2022-12-20 17:52 ` [PATCH v3 1/4] common/ml: add initial files for " Srikanth Yalavarthi
2022-12-20 19:04 ` Stephen Hemminger
2022-12-20 19:19 ` [EXT] " Srikanth Yalavarthi
2022-12-20 17:52 ` [PATCH v3 2/4] common/ml: add common utility functions Srikanth Yalavarthi
2022-12-20 17:52 ` [PATCH v3 3/4] common/ml: add scalar type conversion functions Srikanth Yalavarthi
2022-12-20 17:52 ` [PATCH v3 4/4] common/ml: add Arm NEON type conversion routines Srikanth Yalavarthi
2022-12-21 3:08 ` Ruifeng Wang
2022-12-20 19:06 ` [PATCH v3 0/4] implementation of ML common code Stephen Hemminger
2022-12-20 19:17 ` [EXT] " Srikanth Yalavarthi
2023-01-25 13:18 ` Thomas Monjalon
2023-01-25 13:25 ` [EXT] " Srikanth Yalavarthi
2023-01-25 13:55 ` Thomas Monjalon
2023-01-25 14:59 ` Srikanth Yalavarthi
2023-01-26 10:57 ` Thomas Monjalon
2023-01-27 6:40 ` Jerin Jacob
2023-01-27 8:50 ` Thomas Monjalon
2023-01-27 9:02 ` Jerin Jacob
2023-01-27 9:26 ` Thomas Monjalon
2023-01-27 10:28 ` Jerin Jacob
2023-01-31 13:44 ` Srikanth Yalavarthi
2023-02-01 9:15 ` Srikanth Yalavarthi
2023-02-01 9:04 ` [PATCH v4 0/4] Implementation " Srikanth Yalavarthi
2023-02-01 9:04 ` [PATCH v4 1/4] mldev: add headers for internal ML functions Srikanth Yalavarthi
2023-02-01 13:54 ` Anup Prabhu
2023-02-01 15:28 ` Thomas Monjalon
2023-02-01 9:04 ` [PATCH v4 2/4] mldev: implement ML IO type handling functions Srikanth Yalavarthi
2023-02-01 13:53 ` Anup Prabhu
2023-02-01 14:01 ` Anup Prabhu
2023-02-01 14:15 ` Anup Prabhu
2023-02-01 14:26 ` Anup Prabhu
2023-02-01 9:04 ` [PATCH v4 3/4] mldev: add scalar type conversion functions Srikanth Yalavarthi
2023-02-01 9:04 ` [PATCH v4 4/4] mldev: add Arm NEON type conversion routines Srikanth Yalavarthi
2023-02-01 9:12 ` [PATCH v5 0/4] Implementation of ML common code Srikanth Yalavarthi
2023-02-01 9:12 ` [PATCH v5 1/4] mldev: add headers for internal ML functions Srikanth Yalavarthi
2023-02-01 9:12 ` [PATCH v5 2/4] mldev: implement ML IO type handling functions Srikanth Yalavarthi
2023-02-02 4:20 ` Anup Prabhu
2023-02-01 9:12 ` [PATCH v5 3/4] mldev: add scalar type conversion functions Srikanth Yalavarthi
2023-02-01 9:12 ` [PATCH v5 4/4] mldev: add Arm NEON type conversion routines Srikanth Yalavarthi
2023-02-07 16:00 ` [PATCH v6 0/4] Implementation of ML common code Srikanth Yalavarthi
2023-02-07 16:00 ` [PATCH v6 1/4] mldev: add headers for internal ML functions Srikanth Yalavarthi
2023-03-09 20:44 ` Thomas Monjalon
2023-02-07 16:00 ` [PATCH v6 2/4] mldev: implement ML IO type handling functions Srikanth Yalavarthi
2023-02-07 16:00 ` [PATCH v6 3/4] mldev: add scalar type conversion functions Srikanth Yalavarthi
2023-02-07 16:00 ` Srikanth Yalavarthi [this message]
2023-03-09 21:37 ` [PATCH v6 0/4] Implementation of ML common code Thomas Monjalon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230207160008.30182-5-syalavarthi@marvell.com \
--to=syalavarthi@marvell.com \
--cc=aprabhu@marvell.com \
--cc=dev@dpdk.org \
--cc=jerinj@marvell.com \
--cc=pshukla@marvell.com \
--cc=ptakkar@marvell.com \
--cc=ruifeng.wang@arm.com \
--cc=sshankarnara@marvell.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).