14  #ifdef CONFIG_BACKEND_RISCV_V
15    #include "riscv_vector.h"
92    tensor->shape[0] = shape[0];
94    size_t n_bytes = shape[0] * sizeof(float16_t);
97    memcpy(tensor->data, data, n_bytes);
113   tensor->shape[0] = shape[0];
114   tensor->shape[1] = shape[1];
116   size_t n_bytes = shape[0] * shape[1] * sizeof(float16_t);
119   memcpy(tensor->data, data, n_bytes);
147   for (size_t i = 0; i < n; i += 1) {
163   size_t n = shape[0] * shape[1];
164   for (size_t i = 0; i < n; i += 1) {
193   for (size_t i = 0; i < n; i += 1) {
209   size_t n = shape[0] * shape[1];
210   for (size_t i = 0; i < n; i += 1) {
241   for (size_t i = 0; i < n; i += 1) {
242     tensor->data[i] = data;
258   size_t n = shape[0] * shape[1];
259   for (size_t i = 0; i < n; i += 1) {
260     tensor->data[i] = data;
288   for (size_t i = 0; i < n; i += 1) {
304   size_t n = shape[0] * shape[1];
305   for (size_t i = 0; i < n; i += 1) {
336   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
338     if (i < tensor->shape[0] - 1) {
354   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
359     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
361       if (j < tensor->shape[1] - 1) {
366     if (i < tensor->shape[0] - 1) {
404   size_t n = a->shape[0];
405   for (size_t i = 0; i < n; i += 1) {
427   for (size_t i = 0; i < n; i += 1) {
449   size_t n = x->shape[0];
452   #ifdef CONFIG_BACKEND_RISCV_ZVFH
453     vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);
456       size_t vl = __riscv_vsetvl_e16m1(n);
457       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
458       vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
462     y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
465     for (size_t i = 0; i < n; i += 1) {
466       float val = as_f32(x_data[i]);
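Only isolated lines of the ZVFH reduction survive in this listing; the strip-mining loop that advances x_data and decrements n between vsetvl calls is not shown. A minimal sketch of the complete pattern, assuming the Zvfh backend and plain float16_t buffers (the helper name f16_max1d is illustrative, not part of the API):

#include <stddef.h>
#include <riscv_vector.h>
#include "float16.h"

// Illustrative strip-mined fp16 max reduction (assumes the Zvfh extension is available).
static float16_t f16_max1d(const float16_t *x, size_t n) {
  vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);      // scalar accumulator seed
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e16m1(n);                             // elements handled this pass
    vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x, vl);               // load a chunk
    vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);   // fold max into element 0
    x += vl;
    n -= vl;
  }
  return __riscv_vfmv_f_s_f16m1_f16(vec_max);                        // extract element 0
}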
484   #ifdef CONFIG_BACKEND_RISCV_ZVFH
485     vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);
488       size_t vl = __riscv_vsetvl_e16m1(n);
489       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
490       vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
494     y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
497     for (size_t i = 0; i < n; i += 1) {
498       float val = as_f32(x_data[i]);
514   size_t n = x->shape[0];
517   #ifdef CONFIG_BACKEND_RISCV_ZVFH
518     vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);
521       size_t vl = __riscv_vsetvl_e16m1(n);
522       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
523       vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
527     y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
530     for (size_t i = 0; i < n; i += 1) {
531       float val = as_f32(x_data[i]);
550   #ifdef CONFIG_BACKEND_RISCV_ZVFH
551     vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);
554       size_t vl = __riscv_vsetvl_e16m1(n);
555       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
556       vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
560     y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
563     for (size_t i = 0; i < n; i += 1) {
564       float val = as_f32(x_data[i]);
589   size_t n = y->shape[0];
594   #ifdef CONFIG_BACKEND_RISCV_ZVFH
596       size_t vl = __riscv_vsetvl_e16m1(n);
597       vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
598       vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
599       vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
600       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
607     for (size_t i = 0; i < n; i += 1) {
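As with the reductions, only the vector body is visible here; the surrounding strip-mining loop is implied. A sketch of the full element-wise pattern under the same assumptions (plain float16_t buffers; the helper name f16_add1d is illustrative, not part of the API):

#include <stddef.h>
#include <riscv_vector.h>
#include "float16.h"

// Illustrative strip-mined element-wise fp16 add (assumes the Zvfh extension is available).
static void f16_add1d(float16_t *y, const float16_t *x1, const float16_t *x2, size_t n) {
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e16m1(n);                      // elements handled this pass
    vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1, vl);
    vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2, vl);
    vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
    __riscv_vse16_v_f16m1(y, vec_y, vl);
    x1 += vl;
    x2 += vl;
    y += vl;
    n -= vl;
  }
}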
634   #ifdef CONFIG_BACKEND_RISCV_ZVFH
636       size_t vl = __riscv_vsetvl_e16m1(n);
637       vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
638       vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
639       vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
640       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
647     for (size_t i = 0; i < n; i += 1) {
665   size_t n = y->shape[0];
669   #ifdef CONFIG_BACKEND_RISCV_ZVFH
671       size_t vl = __riscv_vsetvl_e16m1(n);
672       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
673       vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
674       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
680     for (size_t i = 0; i < n; i += 1) {
702   #ifdef CONFIG_BACKEND_RISCV_ZVFH
704       size_t vl = __riscv_vsetvl_e16m1(n);
705       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
706       vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
707       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
713     for (size_t i = 0; i < n; i += 1) {
739   size_t n = y->shape[0];
744   #ifdef CONFIG_BACKEND_RISCV_ZVFH
746       size_t vl = __riscv_vsetvl_e16m1(n);
747       vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
748       vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
749       vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
750       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
757     for (size_t i = 0; i < n; i += 1) {
781   #ifdef CONFIG_BACKEND_RISCV_ZVFH
783       size_t vl = __riscv_vsetvl_e16m1(n);
784       vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
785       vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
786       vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
787       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
794     for (size_t i = 0; i < n; i += 1) {
812   size_t n = y->shape[0];
816   #ifdef CONFIG_BACKEND_RISCV_ZVFH
818       size_t vl = __riscv_vsetvl_e16m1(n);
819       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
820       vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
821       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
827     for (size_t i = 0; i < n; i += 1) {
849   #ifdef CONFIG_BACKEND_RISCV_ZVFH
851       size_t vl = __riscv_vsetvl_e16m1(n);
852       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
853       vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
854       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
860     for (size_t i = 0; i < n; i += 1) {
884   size_t n = y->shape[0];
890   for (size_t i = 0; i < n; i += 1) {
893   y_data[0] = as_f16(sum_f32);
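A hedged usage sketch of the dot-product routine above, built only from the signatures listed at the end of this page; the values and the function name example_dot are illustrative.

#include "nn_f16.h"

// Hypothetical usage: dot product of two length-3 fp16 vectors; the result lands in y->data[0].
void example_dot(void) {
  float16_t a_data[3] = {as_f16(1.0f), as_f16(2.0f), as_f16(3.0f)};
  float16_t b_data[3] = {as_f16(4.0f), as_f16(5.0f), as_f16(6.0f)};
  Tensor1D_F16 *a = nn_tensor1d_f16((size_t[]){3}, a_data);
  Tensor1D_F16 *b = nn_tensor1d_f16((size_t[]){3}, b_data);
  Tensor1D_F16 *y = nn_zeros1d_f16((size_t[]){1});
  nn_dot_f16(y, a, b);
  nn_print_f16(y->data[0], 3);  // 1*4 + 2*5 + 3*6 = 32
}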
906   nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
909   const size_t n = x1->shape[0];
910   const size_t m = x1->shape[1];
911   const size_t p = x2->shape[1];
913   for (size_t i = 0; i < n; i += 1) {
917   #ifdef CONFIG_BACKEND_RISCV_ZVFH
919     size_t vlmax = __riscv_vsetvlmax_e16m1();
920     for (size_t j = 0; j < p; j += 1) {
921       vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
922       vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);
928         size_t vl = __riscv_vsetvl_e16m1(k);
929         vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_row, vl);
930         vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_col, p * sizeof(float16_t), vl);
931         vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);
938       #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
939         vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
941         vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
943       y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
946     for (size_t j = 0; j < p; j += 1) {
950       for (size_t k = 0; k < m; k += 1) {
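For context, a usage sketch of nn_mm_f16 based only on the signatures on this page; the shapes follow the assert above (x1 is n x m, x2 is m x p, y is n x p). The values and the function name example_mm are illustrative.

#include "nn_f16.h"

// Hypothetical usage: (2 x 3) * (3 x 2) matrix multiplication.
void example_mm(void) {
  float16_t a_data[6] = {as_f16(1.0f), as_f16(2.0f), as_f16(3.0f),
                         as_f16(4.0f), as_f16(5.0f), as_f16(6.0f)};
  Tensor2D_F16 *a = nn_tensor2d_f16((size_t[]){2, 3}, a_data);
  Tensor2D_F16 *b = nn_ones2d_f16((size_t[]){3, 2});
  Tensor2D_F16 *y = nn_zeros2d_f16((size_t[]){2, 2});
  nn_mm_f16(y, a, b);
  nn_print_tensor2d_f16(y);  // with b all ones: [[6, 6], [15, 15]]
}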
970   nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform Linear on tensors of different shapes");
973   const size_t n = x1->shape[0];
974   const size_t m = x1->shape[1];
975   const size_t p = x2->shape[1];
977   for (size_t i = 0; i < n; i += 1) {
982   #ifdef CONFIG_BACKEND_RISCV_ZVFH
984     size_t vlmax = __riscv_vsetvlmax_e16m1();
985     for (size_t j = 0; j < p; j += 1) {
986       vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
987       vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);
993         size_t vl = __riscv_vsetvl_e16m1(k);
994         vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_row, vl);
995         vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_col, p * sizeof(float16_t), vl);
996         vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);
1003      #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1004        vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
1006        vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
1008      y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum) + c_row[j];
1015    for (size_t j = 0; j < p; j += 1) {
1019      for (size_t k = 0; k < m; k += 1) {
1041   nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
1042   nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
1045   const size_t batch_size = x->shape[0];
1046   const size_t in_features = x->shape[1];
1047   const size_t out_features = weight->shape[0];
1052   for (size_t i = 0; i < batch_size; i += 1) {
1056   #ifdef CONFIG_BACKEND_RISCV_ZVFH
1058     size_t vlmax = __riscv_vsetvlmax_e16m1();
1060     for (size_t j = 0; j < out_features; j += 1) {
1061       vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
1062       vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);
1065       size_t n = in_features;
1068         size_t vl = __riscv_vsetvl_e16m1(n);
1069         vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
1070         vfloat16m1_t vec_w = __riscv_vle16_v_f16m1(weight_row, vl);
1071         vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x, vec_w, vl);
1078       #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1079         vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
1081         vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
1084       float16_t sum = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
1089       x_data = x_batch_data;
1093     for (size_t j = 0; j < out_features; j += 1) {
1097       for (size_t k = 0; k < in_features; k += 1) {
1107     x_batch_data += in_features;
1108     y_batch_data += out_features;
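A usage sketch of nn_linear_f16 consistent with the shape checks above: weight is stored as out_features x in_features, bias has length out_features and may be NULL, x is batch_size x in_features. Names and sizes are illustrative.

#include "nn_f16.h"

// Hypothetical usage: batch of 2, 4 input features, 3 output features.
void example_linear(void) {
  Tensor2D_F16 *x = nn_rand2d_f16((size_t[]){2, 4});       // [batch_size, in_features]
  Tensor2D_F16 *weight = nn_rand2d_f16((size_t[]){3, 4});  // [out_features, in_features]
  Tensor1D_F16 *bias = nn_zeros1d_f16((size_t[]){3});      // [out_features], or NULL to skip
  Tensor2D_F16 *y = nn_zeros2d_f16((size_t[]){2, 3});      // [batch_size, out_features]
  nn_linear_f16(y, x, weight, bias);
  nn_print_tensor2d_f16(y);
}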
1134   for (size_t i = 0; i < n; i += 1) {
1135     if (as_f32(x_data[i]) > 0) {
1136       y_data[i] = x_data[i];
1139       y_data[i] = as_f16(alpha * (expf(as_f32(x_data[i])) - 1.f));
1159   #ifdef CONFIG_BACKEND_RISCV_ZVFH
1163       size_t vl = __riscv_vsetvl_e16m1(n);
1164       vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
1165       vfloat16m1_t vec_y = __riscv_vfmax_vf_f16m1(vec_x, zero, vl);
1166       __riscv_vse16_v_f16m1(y_data, vec_y, vl);
1172     for (size_t i = 0; i < n; i += 1) {
1173       float x_val = as_f32(x_data[i]);
1174       y_data[i] = x_val > 0 ? as_f16(x_val) : 0;
1190   size_t n = y->shape[0];
1195   for (size_t i = 0; i < n; i += 1) {
1196     sum += expf(as_f32(x_data[i]));
1199   for (size_t i = 0; i < n; i += 1) {
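A usage sketch for the 1D softmax above, again using only the signatures on this page; values and the function name example_softmax are illustrative.

#include "nn_f16.h"

// Hypothetical usage: softmax over a length-3 vector.
void example_softmax(void) {
  float16_t logits[3] = {as_f16(1.0f), as_f16(2.0f), as_f16(3.0f)};
  Tensor1D_F16 *x = nn_tensor1d_f16((size_t[]){3}, logits);
  Tensor1D_F16 *y = nn_zeros1d_f16((size_t[]){3});
  nn_softmax1d_f16(y, x);
  nn_print_tensor1d_f16(y);  // entries are positive and sum to ~1
}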
1219   for (size_t i = 0; i < n; i += 1) {
1220     float x_val = as_f32(x_data[i]);
1221     y_data[i] = as_f16(tanh(x_val));
static float16_t as_f16(float f)
Definition: float16.h:116
static float as_f32(float16_t h)
Definition: float16.h:50
#define FLT16_MAX
Definition: float16.h:27
_Float16 float16_t
Definition: float16.h:33
Half-Precision Floating-Point (fp16) Definitions.
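A small sketch of how the conversion helpers above are typically combined; the widen-compute-narrow pattern mirrors the scalar fallback paths in the listing. The function name example_f16_math is illustrative.

#include "float16.h"

// Hypothetical sketch: fp16 values are widened to f32 for scalar math, then narrowed back.
void example_f16_math(void) {
  float16_t h = as_f16(1.5f);
  float sum = as_f32(h) + 0.25f;
  float16_t result = as_f16(sum);   // 1.75, exactly representable in fp16
  float16_t largest = FLT16_MAX;    // largest finite fp16 value (65504)
  (void)result;
  (void)largest;
}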
void nn_print_f32(float v, int16_t num_digits)
Definition: nn.h:96
static void nn_assert(int condition, char *message)
Definition: nn.h:59
void nn_min2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x)
Finds the minimum value in a 2D tensor with type F16.
Definition: nn_f16.h:546
void nn_mulscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar)
Multiplies a scalar with a 2D tensor with type F16.
Definition: nn_f16.h:842
Tensor2D_F16 * nn_tensor2d_f16(size_t shape[2], const float16_t *data)
Creates a 2D tensor with type F16.
Definition: nn_f16.h:111
Tensor2D_F16 * nn_zeros2d_f16(size_t shape[2])
Creates a 2D tensor with type F16 and initializes it to 0.
Definition: nn_f16.h:161
void nn_softmax1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x)
Applies the softmax activation function to a 1D tensor with type F16.
Definition: nn_f16.h:1187
void nn_min1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x)
Finds the minimum value in a 1D tensor with type F16.
Definition: nn_f16.h:513
void nn_print_f16(float16_t v, int16_t num_digits)
Prints a half-precision floating-point number.
Definition: nn_f16.h:323
void nn_addscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar)
Adds a scalar to a 1D tensor with type F16.
Definition: nn_f16.h:662
void nn_mul1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f16.h:735
void nn_addscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar)
Adds a scalar to a 2D tensor with type F16.
Definition: nn_f16.h:695
void nn_mul2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f16.h:772
void nn_tanh2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x)
Applies the tanh activation function to a 2D tensor with type F16.
Definition: nn_f16.h:1212
Tensor0D_F16 * nn_zeros0d_f16()
Creates a 0D tensor with type F16 and initializes it to 0.
Definition: nn_f16.h:131
static uint8_t nn_equal_f16(float16_t golden, float16_t actual, float rel_err)
Checks if two half-precision floating-point numbers are equal within a relative error.
Definition: nn_f16.h:59
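An illustrative call, assuming rel_err is a relative tolerance as the description suggests; the function name example_equal is not part of the API.

#include "nn_f16.h"

// Hypothetical check: compare an fp16 result to a golden value with 1% relative tolerance.
void example_equal(void) {
  uint8_t ok = nn_equal_f16(as_f16(1.00f), as_f16(1.004f), 1e-2f);
  nn_assert(ok, "values differ by more than 1%");
}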
Tensor1D_F16 * nn_full1d_f16(size_t shape[1], float16_t data)
Creates a 1D tensor with type F16 and initializes it to a specific value.
Definition: nn_f16.h:238
Tensor2D_F16 * nn_full2d_f16(size_t shape[2], float16_t data)
Creates a 2D tensor with type F16 and initializes it to a specific value.
Definition: nn_f16.h:256
void nn_print_tensor1d_f16(const Tensor1D_F16 *tensor)
Prints the content of a 1D tensor with type F16.
Definition: nn_f16.h:334
Tensor0D_F16 * nn_rand0d_f16()
Creates a 0D tensor with type F16 and initializes it to a random value.
Definition: nn_f16.h:272
void nn_mm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2)
Performs a matrix multiplication of the matrices x1 and x2.
Definition: nn_f16.h:905
void nn_print_tensor2d_f16(const Tensor2D_F16 *tensor)
Prints the content of a 2D tensor with type F16.
Definition: nn_f16.h:352
Tensor2D_F16 * nn_rand2d_f16(size_t shape[2])
Creates a 2D tensor with type F16 and initializes it to a random value.
Definition: nn_f16.h:302
Tensor2D_F16 * nn_ones2d_f16(size_t shape[2])
Creates a 2D tensor with type F16 and initializes it to 1.
Definition: nn_f16.h:207
void nn_addmm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *c, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2)
Performs a matrix multiplication of the matrices x1 and x2 and adds c to the result.
Definition: nn_f16.h:969
uint8_t nn_equals2d_f16(const Tensor2D_F16 *a, const Tensor2D_F16 *b, float rel_err)
Checks if two 2D tensors with type F16 are equal.
Definition: nn_f16.h:423
uint8_t nn_equals0d_f16(const Tensor0D_F16 *a, const Tensor0D_F16 *b, float rel_err)
Checks if two 0D tensors with type F16 are equal.
Definition: nn_f16.h:387
Tensor1D_F16 * nn_ones1d_f16(size_t shape[1])
Creates a 1D tensor with type F16 and initializes it to 1.
Definition: nn_f16.h:190
Tensor1D_F16 * nn_zeros1d_f16(size_t shape[1])
Creates a 1D tensor with type F16 and initializes it to 0.
Definition: nn_f16.h:144
void nn_add1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f16.h:585
void nn_max2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x)
Finds the maximum value in a 2D tensor with type F16.
Definition: nn_f16.h:480
void nn_mulscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar)
Multiplies a scalar with a 1D tensor with type F16.
Definition: nn_f16.h:809
void nn_elu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float alpha)
Applies the ELU activation function to a 2D tensor with type F16.
Definition: nn_f16.h:1127
void nn_dot_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2)
Performs a dot product of two 1D tensors with type F16.
Definition: nn_f16.h:880
void nn_linear_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, const Tensor2D_F16 *weight, const Tensor1D_F16 *bias)
Linear neural network layer.
Definition: nn_f16.h:1040
void nn_add2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f16.h:625
void nn_relu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x)
Applies the ReLU activation function to a 2D tensor with type F16.
Definition: nn_f16.h:1152
void nn_max1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x)
Finds the maximum value in a 1D tensor with type F16.
Definition: nn_f16.h:448
uint8_t nn_equals1d_f16(const Tensor1D_F16 *a, const Tensor1D_F16 *b, float rel_err)
Checks if two 1D tensors with type F16 are equal.
Definition: nn_f16.h:401
Tensor1D_F16 * nn_tensor1d_f16(size_t shape[1], const float16_t *data)
Creates a 1D tensor with type F16.
Definition: nn_f16.h:90
Tensor0D_F16 * nn_ones0d_f16()
Creates a 0D tensor with type F16 and initializes it to 1.
Definition: nn_f16.h:177
Tensor0D_F16 * nn_full0d_f16(float16_t data)
Creates a 0D tensor with type F16 and initializes it to a specific value.
Definition: nn_f16.h:224
Tensor0D_F16 * nn_tensor0d_f16(float16_t data)
Creates a 0D tensor with type F16.
Definition: nn_f16.h:75
Tensor1D_F16 * nn_rand1d_f16(size_t shape[1])
Creates a 1D tensor with type F16 and initializes it to a random value.
Definition: nn_f16.h:285
float16_t data
Definition: nn_f16.h:24
A 0D tensor (scalar) with a half-precision floating-point data type.
Definition: nn_f16.h:23
size_t shape[1]
Definition: nn_f16.h:34
float16_t * data
Definition: nn_f16.h:35
A 1D tensor with a half-precision floating-point data type.
Definition: nn_f16.h:33
size_t shape[2]
Definition: nn_f16.h:45
float16_t * data
Definition: nn_f16.h:46
A 2D tensor with a half-precision floating-point data type.
Definition: nn_f16.h:44