#ifdef CONFIG_BACKEND_RISCV_V
  #include "riscv_vector.h"
/* nn_tensor1d_f16: record the shape and copy the caller's data into the new 1D tensor */
tensor->shape[0] = shape[0];
size_t n_bytes = shape[0] * sizeof(float16_t);
memcpy(tensor->data, data, n_bytes);
/* nn_tensor2d_f16: same pattern for a 2D tensor */
tensor->shape[0] = shape[0];
tensor->shape[1] = shape[1];
size_t n_bytes = shape[0] * shape[1] * sizeof(float16_t);
memcpy(tensor->data, data, n_bytes);
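Since both constructors memcpy the caller's buffer, the source array can live on the stack and be reused afterwards. A minimal usage sketch, assuming nn_f16.h is included and the code runs inside a function (the values are arbitrary):

float16_t buf[6] = {as_f16(1.0f), as_f16(2.0f), as_f16(3.0f),
                    as_f16(4.0f), as_f16(5.0f), as_f16(6.0f)};
Tensor2D_F16 *t = nn_tensor2d_f16((size_t[]){2, 3}, buf);  // shape [2 x 3]; data is copied out of buf
nn_print_tensor2d_f16(t);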
/* nn_zeros1d_f16: fill loop over the n elements of the new 1D tensor */
for (size_t i = 0; i < n; i += 1) {

/* nn_zeros2d_f16: flatten the 2D tensor and fill element by element */
size_t n = shape[0] * shape[1];
for (size_t i = 0; i < n; i += 1) {

/* nn_ones1d_f16 */
for (size_t i = 0; i < n; i += 1) {

/* nn_ones2d_f16 */
size_t n = shape[0] * shape[1];
for (size_t i = 0; i < n; i += 1) {

/* nn_full1d_f16: every element is set to the constant passed in `data` */
for (size_t i = 0; i < n; i += 1) {
  tensor->data[i] = data;
}

/* nn_full2d_f16 */
size_t n = shape[0] * shape[1];
for (size_t i = 0; i < n; i += 1) {
  tensor->data[i] = data;
}

/* nn_rand1d_f16 */
for (size_t i = 0; i < n; i += 1) {

/* nn_rand2d_f16 */
size_t n = shape[0] * shape[1];
for (size_t i = 0; i < n; i += 1) {
/* nn_print_tensor1d_f16: print each element; a separator follows every element except the last */
for (size_t i = 0; i < tensor->shape[0]; i += 1) {
  if (i < tensor->shape[0] - 1) {
/* nn_print_tensor2d_f16: nested row/column loops; separators after all but the last column and row */
for (size_t i = 0; i < tensor->shape[0]; i += 1) {
  for (size_t j = 0; j < tensor->shape[1]; j += 1) {
    if (j < tensor->shape[1] - 1) {
  }
  if (i < tensor->shape[0] - 1) {
/* nn_equals1d_f16: compare element by element */
size_t n = a->shape[0];
for (size_t i = 0; i < n; i += 1) {

/* nn_equals2d_f16 */
for (size_t i = 0; i < n; i += 1) {
/* nn_max1d_f16: maximum of a 1D tensor */
size_t n = x->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  // seed the accumulator with -FLT16_MAX, then fold the input in with a vector max-reduction
  vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
  y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
#else
  // scalar fallback
  for (size_t i = 0; i < n; i += 1) {
/* nn_max2d_f16: same reduction over the flattened 2D tensor */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
  y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_min1d_f16: minimum of a 1D tensor (accumulator seeded with +FLT16_MAX) */
size_t n = x->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
  y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_min2d_f16 */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
  y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
#else
  for (size_t i = 0; i < n; i += 1) {
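The excerpt only shows the vector body of these reductions; in the full source, the vsetvl/load/reduce lines sit inside a strip-mining loop that advances the data pointer by vl elements per pass until the tensor is consumed. A minimal sketch of that pattern for the max reduction, assuming the nn_max1d_f16 signature and contiguous data (the loop structure here is illustrative, not the verbatim library code):

size_t n = x->shape[0];
const float16_t *x_data = x->data;
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);    // running max lives in element 0
  while (n > 0) {
    size_t vl = __riscv_vsetvl_e16m1(n);                           // elements processed this pass
    vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);        // load the next chunk
    vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl); // fold it into the running max
    x_data += vl;
    n -= vl;
  }
  y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);                   // move the scalar result out
#endif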
/* nn_add1d_f16: element-wise y = x1 + x2 */
size_t n = y->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
  vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
  vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_add2d_f16 */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
  vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
  vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_addscalar1d_f16: y = x + scalar (the .vf form broadcasts the scalar operand) */
size_t n = y->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_addscalar2d_f16 */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_mul1d_f16: element-wise y = x1 * x2 */
size_t n = y->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
  vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
  vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_mul2d_f16 */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
  vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
  vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_mulscalar1d_f16: y = x * scalar */
size_t n = y->shape[0];

#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_mulscalar2d_f16 */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_dot_f16 */
size_t n = y->shape[0];
for (size_t i = 0; i < n; i += 1) {
/* nn_mm_f16: matrix multiply, y[n x p] = x1[n x m] @ x2[m x p] */
nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");

const size_t n = x1->shape[0];
const size_t m = x1->shape[1];
const size_t p = x2->shape[1];

for (size_t i = 0; i < n; i += 1) {
  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    size_t vlmax = __riscv_vsetvlmax_e16m1();
    for (size_t j = 0; j < p; j += 1) {
      vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
      vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

      // strip-mined over the length-m dot product; k counts the remaining elements.
      // x1 is read with unit stride, the x2 column with a row stride of p elements.
      size_t vl = __riscv_vsetvl_e16m1(k);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
      vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_ptr, p * sizeof(float16_t), vl);
      vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

      // reduce the per-lane partial sums to a single value
      #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
        vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #else
        vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #endif
      y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
    }
  #else
    for (size_t j = 0; j < p; j += 1) {
      for (size_t k = 0; k < m; k += 1) {
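The scalar fallback of nn_mm_f16 is truncated above; it computes the same result as the vector path. A plain-C reference of that computation, assuming row-major storage (consistent with the i * p + j indexing and the p * sizeof(float16_t) column stride) and using the as_f32/as_f16 helpers from float16.h; the fp32 accumulator is a choice made here for clarity, not necessarily what the library does:

for (size_t i = 0; i < n; i += 1) {
  for (size_t j = 0; j < p; j += 1) {
    float sum = 0.0f;
    for (size_t k = 0; k < m; k += 1) {
      // y[i][j] += x1[i][k] * x2[k][j]
      sum += as_f32(x1->data[i * m + k]) * as_f32(x2->data[k * p + j]);
    }
    y->data[i * p + j] = as_f16(sum);
  }
}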
/* nn_addmm_f16: y = c + x1 @ x2 (the matmul loop from nn_mm_f16, plus the additive term from c) */
nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform Linear on tensors of different shapes");

const size_t n = x1->shape[0];
const size_t m = x1->shape[1];
const size_t p = x2->shape[1];

for (size_t i = 0; i < n; i += 1) {
  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    size_t vlmax = __riscv_vsetvlmax_e16m1();
    for (size_t j = 0; j < p; j += 1) {
      vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
      vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

      size_t vl = __riscv_vsetvl_e16m1(k);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
      vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_ptr, p * sizeof(float16_t), vl);
      vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

      #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
        vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #else
        vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #endif
      y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum) + c->data[i * p + j];
    }
  #else
    for (size_t j = 0; j < p; j += 1) {
      for (size_t k = 0; k < m; k += 1) {
/* nn_linear_f16: fully-connected layer, y = x @ weight^T (+ bias);
   weight is stored as [out_features x in_features] and bias may be NULL */
nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");

const size_t batch_size = x->shape[0];
const size_t in_features = x->shape[1];
const size_t out_features = weight->shape[0];

for (size_t i = 0; i < batch_size; i += 1) {
  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    size_t vlmax = __riscv_vsetvlmax_e16m1();

    for (size_t j = 0; j < out_features; j += 1) {
      vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
      vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

      // dot product of the current input row with one weight row; both are contiguous,
      // so unit-stride loads are enough (no strided access as in nn_mm_f16)
      size_t n = in_features;
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_w = __riscv_vle16_v_f16m1(weight_row, vl);
      vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x, vec_w, vl);

      #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
        vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #else
        vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
      #endif
      float16_t sum = __riscv_vfmv_f_s_f16m1_f16(vec_sum);

      x_data = x_batch_data;          // rewind the input pointer for the next output feature
    }
    x_batch_data += in_features;      // advance to the next row of the batch
    y_batch_data += out_features;
  #else
    for (size_t j = 0; j < out_features; j += 1) {
      for (size_t k = 0; k < in_features; k += 1) {
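A hypothetical call sequence for the linear layer, based only on the assertions and signatures above: x is [batch_size x in_features], weight is [out_features x in_features], and bias is [out_features] (it may also be NULL). The shapes and values here are invented for illustration:

Tensor2D_F16 *x      = nn_rand2d_f16((size_t[]){2, 3});   // batch of 2, 3 input features
Tensor2D_F16 *weight = nn_rand2d_f16((size_t[]){4, 3});   // 4 output features, 3 input features
Tensor1D_F16 *bias   = nn_ones1d_f16((size_t[]){4});
Tensor2D_F16 *y      = nn_zeros2d_f16((size_t[]){2, 4});
Tensor2D_F16 *a      = nn_zeros2d_f16((size_t[]){2, 4});

nn_linear_f16(y, x, weight, bias);   // y = x @ weight^T + bias
nn_relu2d_f16(a, y);                 // apply the activation into a separate tensor
nn_print_tensor2d_f16(a);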
/* nn_elu2d_f16 */
for (size_t i = 0; i < n; i += 1) {
/* nn_relu2d_f16: element-wise y = max(x, 0) */
#ifdef CONFIG_BACKEND_RISCV_ZVFH
  size_t vl = __riscv_vsetvl_e16m1(n);
  vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
  vfloat16m1_t vec_y = __riscv_vfmax_vf_f16m1(vec_x, zero, vl);
  __riscv_vse16_v_f16m1(y_data, vec_y, vl);
#else
  for (size_t i = 0; i < n; i += 1) {
/* nn_softmax1d_f16: two passes over the 1D tensor */
size_t n = y->shape[0];

// first pass
for (size_t i = 0; i < n; i += 1) {

// second pass
for (size_t i = 0; i < n; i += 1) {
/* nn_tanh2d_f16 */
for (size_t i = 0; i < n; i += 1) {
/* Referenced symbols */

// float16.h — Half-Precision Floating-Point (fp16) Definitions
static float16_t as_f16(float f);                    // float16.h:116
static float as_f32(float16_t h);                    // float16.h:50
#define FLT16_MAX                                    // float16.h:27
typedef _Float16 float16_t;                          // float16.h:33

// nn.h
static void nn_assert(int condition, char *message); // nn.h:54
void nn_print_f32(float v, int16_t num_digits);      // nn.h:91

// nn_f16.h — tensor constructors
Tensor0D_F16 *nn_tensor0d_f16(float16_t data);                          // nn_f16.h:74
Tensor1D_F16 *nn_tensor1d_f16(size_t shape[1], const float16_t *data);  // nn_f16.h:88
Tensor2D_F16 *nn_tensor2d_f16(size_t shape[2], const float16_t *data);  // nn_f16.h:108
Tensor0D_F16 *nn_zeros0d_f16();                                         // nn_f16.h:121
Tensor1D_F16 *nn_zeros1d_f16(size_t shape[1]);                          // nn_f16.h:126
Tensor2D_F16 *nn_zeros2d_f16(size_t shape[2]);                          // nn_f16.h:135
Tensor0D_F16 *nn_ones0d_f16();                                          // nn_f16.h:144
Tensor1D_F16 *nn_ones1d_f16(size_t shape[1]);                           // nn_f16.h:149
Tensor2D_F16 *nn_ones2d_f16(size_t shape[2]);                           // nn_f16.h:158
Tensor0D_F16 *nn_full0d_f16(float16_t data);                            // nn_f16.h:167
Tensor1D_F16 *nn_full1d_f16(size_t shape[1], float16_t data);           // nn_f16.h:172
Tensor2D_F16 *nn_full2d_f16(size_t shape[2], float16_t data);           // nn_f16.h:181
Tensor0D_F16 *nn_rand0d_f16();                                          // nn_f16.h:190
Tensor1D_F16 *nn_rand1d_f16(size_t shape[1]);                           // nn_f16.h:195
Tensor2D_F16 *nn_rand2d_f16(size_t shape[2]);                           // nn_f16.h:204

// nn_f16.h — printing and comparison
void nn_print_f16(float16_t v, int16_t num_digits);                                    // nn_f16.h:225
void nn_print_tensor1d_f16(const Tensor1D_F16 *tensor);                                // nn_f16.h:238
void nn_print_tensor2d_f16(const Tensor2D_F16 *tensor);                                // nn_f16.h:256
static uint8_t nn_equal_f16(float16_t golden, float16_t actual, float rel_err);        // nn_f16.h:59
uint8_t nn_equals0d_f16(const Tensor0D_F16 *a, const Tensor0D_F16 *b, float rel_err);  // nn_f16.h:291
uint8_t nn_equals1d_f16(const Tensor1D_F16 *a, const Tensor1D_F16 *b, float rel_err);  // nn_f16.h:305
uint8_t nn_equals2d_f16(const Tensor2D_F16 *a, const Tensor2D_F16 *b, float rel_err);  // nn_f16.h:327

// nn_f16.h — reductions and element-wise operations
void nn_max1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x);                            // nn_f16.h:344
void nn_max2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x);                            // nn_f16.h:368
void nn_min1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x);                            // nn_f16.h:393
void nn_min2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x);                            // nn_f16.h:418
void nn_add1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2);   // nn_f16.h:457
void nn_add2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2);   // nn_f16.h:498
void nn_addscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar);    // nn_f16.h:526
void nn_addscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar);    // nn_f16.h:550
void nn_mul1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2);   // nn_f16.h:582
void nn_mul2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2);   // nn_f16.h:611
void nn_mulscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar);    // nn_f16.h:640
void nn_mulscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar);    // nn_f16.h:666

// nn_f16.h — matrix operations and activations
void nn_dot_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2);     // nn_f16.h:695
void nn_mm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2);      // nn_f16.h:714
void nn_addmm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *c, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2); // nn_f16.h:767
void nn_linear_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, const Tensor2D_F16 *weight, const Tensor1D_F16 *bias); // nn_f16.h:823
void nn_elu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float alpha);               // nn_f16.h:896
void nn_relu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x);                           // nn_f16.h:911
void nn_softmax1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x);                        // nn_f16.h:939
void nn_tanh2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x);                           // nn_f16.h:955

// struct members
float16_t data;    // Tensor0D_F16, nn_f16.h:24
size_t shape[1];   // Tensor1D_F16, nn_f16.h:34
float16_t *data;   // Tensor1D_F16, nn_f16.h:35
size_t shape[2];   // Tensor2D_F16, nn_f16.h:45
float16_t *data;   // Tensor2D_F16, nn_f16.h:46
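The comparison helpers take a relative-error tolerance, which is handy for golden checks on fp16 results. A short, hypothetical example (the data and the 1% tolerance are arbitrary choices, not values from the library; nonzero return is treated as "all elements match within the tolerance"):

float16_t golden_data[4] = {as_f16(1.0f), as_f16(1.0f), as_f16(1.0f), as_f16(1.0f)};
Tensor2D_F16 *golden = nn_tensor2d_f16((size_t[]){2, 2}, golden_data);
Tensor2D_F16 *actual = nn_ones2d_f16((size_t[]){2, 2});
if (!nn_equals2d_f16(golden, actual, 1e-2f)) {   // 1% relative tolerance
  nn_print_tensor2d_f16(actual);                 // print the mismatching result
}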