Baremetal-NN
Baremetal-NN API documentation
nn_f32.h
1
8#ifndef __NN_F32_H
9#define __NN_F32_H
10
11#include <float.h>
12#include <stdint.h>
13
14#ifdef CONFIG_BACKEND_RISCV_V
15 #include "riscv_vector.h"
16#endif
17
23typedef struct {
24 float data;
25} Tensor0D_F32;
26
27
33typedef struct {
34 size_t shape[1];
35 float *data;
36} Tensor1D_F32;
37
38
44typedef struct {
45 size_t shape[2];
46 float *data;
47} Tensor2D_F32;
48
54typedef struct {
55 size_t shape[3];
56 float *data;
57} Tensor3D_F32;
58
64typedef struct {
65 size_t shape[4];
66 float *data;
67} Tensor4D_F32;
68
69
80static inline uint8_t nn_equal_f32(float golden, float actual, float rel_err) {
81 return (fabs(actual - golden) < rel_err) || (fabs((actual - golden) / actual) < rel_err);
82}
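// Illustrative sketch: nn_equal_f32 accepts a value if either the absolute or the
// relative error is below rel_err, which makes it suitable for golden-model checks.
// The function name and the tolerance 1e-3f below are made up for this example.
static void example_equal_check(void) {
  float golden = 1.000f;
  float actual = 1.0004f;
  if (!nn_equal_f32(golden, actual, 1e-3f)) {
    // handle the mismatch, e.g. count it as a test failure
  }
}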
83
84
85/* ======================================================================================================== */
86/* Tensor Creation */
87/* ======================================================================================================== */
88
98Tensor0D_F32 *nn_tensor0d_f32(float data) {
99 Tensor0D_F32 *tensor = (Tensor0D_F32 *)malloc(sizeof(Tensor0D_F32));
100 tensor->data = data;
101 return tensor;
102}
103
114Tensor1D_F32 *nn_tensor1d_f32(size_t shape[1], const float *data) {
115 Tensor1D_F32 *tensor = (Tensor1D_F32 *)malloc(sizeof(Tensor1D_F32));
116 tensor->shape[0] = shape[0];
117
118 size_t n_bytes = shape[0] * sizeof(float);
119 tensor->data = (float *)malloc(n_bytes);
120 if (data != NULL) {
121 memcpy(tensor->data, data, n_bytes);
122 }
123 return tensor;
124}
125
136Tensor2D_F32 *nn_tensor2d_f32(size_t shape[2], const float *data) {
137 Tensor2D_F32 *tensor = (Tensor2D_F32 *)malloc(sizeof(Tensor2D_F32));
138 tensor->shape[0] = shape[0];
139 tensor->shape[1] = shape[1];
140
141 size_t n_bytes = shape[0] * shape[1] * sizeof(float);
142 tensor->data = (float *)malloc(n_bytes);
143 if (data != NULL) {
144 memcpy(tensor->data, data, n_bytes);
145 }
146 return tensor;
147}
148
159Tensor3D_F32 *nn_tensor3d_f32(size_t shape[3], const float *data) {
160 Tensor3D_F32 *tensor = (Tensor3D_F32 *)malloc(sizeof(Tensor3D_F32));
161 tensor->shape[0] = shape[0];
162 tensor->shape[1] = shape[1];
163 tensor->shape[2] = shape[2];
164
165 size_t n_bytes = shape[0] * shape[1] * shape[2] * sizeof(float);
166 tensor->data = (float *)malloc(n_bytes);
167 if (data != NULL) {
168 memcpy(tensor->data, data, n_bytes);
169 }
170 return tensor;
171}
172
183Tensor4D_F32 *nn_tensor4d_f32(size_t shape[4], const float *data) {
184 Tensor4D_F32 *tensor = (Tensor4D_F32 *)malloc(sizeof(Tensor4D_F32));
185 tensor->shape[0] = shape[0];
186 tensor->shape[1] = shape[1];
187 tensor->shape[2] = shape[2];
188 tensor->shape[3] = shape[3];
189
190 size_t n_bytes = shape[0] * shape[1] * shape[2] * shape[3] * sizeof(float);
191 tensor->data = (float *)malloc(n_bytes);
192 if (data != NULL) {
193 memcpy(tensor->data, data, n_bytes);
194 }
195 return tensor;
196}
197
206Tensor1D_F32 *nn_as_tensor1d_f32(size_t shape[1], float *data) {
207 Tensor1D_F32 *tensor = (Tensor1D_F32 *)malloc(sizeof(Tensor1D_F32));
208 tensor->shape[0] = shape[0];
209 tensor->data = data;
210 return tensor;
211}
212
221Tensor2D_F32 *nn_as_tensor2d_f32(size_t shape[2], float *data) {
222 Tensor2D_F32 *tensor = (Tensor2D_F32 *)malloc(sizeof(Tensor2D_F32));
223 tensor->shape[0] = shape[0];
224 tensor->shape[1] = shape[1];
225 tensor->data = data;
226 return tensor;
227}
228
237Tensor3D_F32 *nn_as_tensor3d_f32(size_t shape[3], float *data) {
238 Tensor3D_F32 *tensor = (Tensor3D_F32 *)malloc(sizeof(Tensor3D_F32));
239 tensor->shape[0] = shape[0];
240 tensor->shape[1] = shape[1];
241 tensor->shape[2] = shape[2];
242 tensor->data = data;
243 return tensor;
244}
245
254Tensor4D_F32 *nn_as_tensor4d_f32(size_t shape[4], float *data) {
255 Tensor4D_F32 *tensor = (Tensor4D_F32 *)malloc(sizeof(Tensor4D_F32));
256 tensor->shape[0] = shape[0];
257 tensor->shape[1] = shape[1];
258 tensor->shape[2] = shape[2];
259 tensor->shape[3] = shape[3];
260 tensor->data = data;
261 return tensor;
262}
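// Illustrative sketch: nn_tensor2d_f32 copies its input into a freshly allocated
// buffer, while nn_as_tensor2d_f32 only wraps an existing buffer. The function
// name and the array `weights_rom` are made up for this example.
static void example_tensor_creation(void) {
  static float weights_rom[2 * 3] = {1, 2, 3, 4, 5, 6};

  // Owning copy: safe to modify without touching weights_rom.
  Tensor2D_F32 *copy = nn_tensor2d_f32((size_t[]){2, 3}, weights_rom);
  // Non-owning view: view->data points directly at weights_rom.
  Tensor2D_F32 *view = nn_as_tensor2d_f32((size_t[]){2, 3}, weights_rom);

  copy->data[0] = 42.0f;   // does not affect weights_rom
  (void)view;

  free(copy->data);        // the copy owns its buffer
  free(copy);
  free(view);              // only the struct was allocated for the view
}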
263
264
270Tensor0D_F32 *nn_zeros0d_f32() {
271 Tensor0D_F32 *tensor = nn_tensor0d_f32(0);
272 return tensor;
273}
274
282Tensor1D_F32 *nn_zeros1d_f32(size_t shape[1]) {
283 Tensor1D_F32 *tensor = nn_tensor1d_f32(shape, NULL);
284 size_t n = shape[0];
285 for (size_t i = 0; i < n; i += 1) {
286 tensor->data[i] = 0;
287 }
288 return tensor;
289}
290
298Tensor2D_F32 *nn_zeros2d_f32(size_t shape[2]) {
299 Tensor2D_F32 *tensor = nn_tensor2d_f32(shape, NULL);
300 size_t n = shape[0] * shape[1];
301 for (size_t i = 0; i < n; i += 1) {
302 tensor->data[i] = 0;
303 }
304 return tensor;
305}
306
314Tensor3D_F32 *nn_zeros3d_f32(size_t shape[3]) {
315 Tensor3D_F32 *tensor = nn_tensor3d_f32(shape, NULL);
316 size_t n = shape[0] * shape[1] * shape[2];
317 for (size_t i = 0; i < n; i += 1) {
318 tensor->data[i] = 0;
319 }
320 return tensor;
321}
322
330Tensor4D_F32 *nn_zeros4d_f32(size_t shape[4]) {
331 Tensor4D_F32 *tensor = nn_tensor4d_f32(shape, NULL);
332 size_t n = shape[0] * shape[1] * shape[2] * shape[3];
333 for (size_t i = 0; i < n; i += 1) {
334 tensor->data[i] = 0;
335 }
336 return tensor;
337}
338
344Tensor0D_F32 *nn_ones0d_f32() {
345 Tensor0D_F32 *tensor = nn_tensor0d_f32(1);
346 return tensor;
347}
348
356Tensor1D_F32 *nn_ones1d_f32(size_t shape[1]) {
357 Tensor1D_F32 *tensor = nn_tensor1d_f32(shape, NULL);
358 size_t n = shape[0];
359 for (size_t i = 0; i < n; i += 1) {
360 tensor->data[i] = 1;
361 }
362 return tensor;
363}
364
372Tensor2D_F32 *nn_ones2d_f32(size_t shape[2]) {
373 Tensor2D_F32 *tensor = nn_tensor2d_f32(shape, NULL);
374 size_t n = shape[0] * shape[1];
375 for (size_t i = 0; i < n; i += 1) {
376 tensor->data[i] = 1;
377 }
378 return tensor;
379}
380
388Tensor0D_F32 *nn_full0d_f32(float data) {
389 Tensor0D_F32 *tensor = nn_tensor0d_f32(data);
390 return tensor;
391}
392
401Tensor1D_F32 *nn_full1d_f32(size_t shape[1], float data) {
402 Tensor1D_F32 *tensor = nn_tensor1d_f32(shape, NULL);
403 size_t n = shape[0];
404 for (size_t i = 0; i < n; i += 1) {
405 tensor->data[i] = data;
406 }
407 return tensor;
408}
409
418Tensor2D_F32 *nn_full2d_f32(size_t shape[2], float data) {
419 Tensor2D_F32 *tensor = nn_tensor2d_f32(shape, NULL);
420 size_t n = shape[0] * shape[1];
421 for (size_t i = 0; i < n; i += 1) {
422 tensor->data[i] = data;
423 }
424 return tensor;
425}
426
432Tensor0D_F32 *nn_rand0d_f32() {
433 Tensor0D_F32 *tensor = nn_tensor0d_f32(rand());
434 return tensor;
435}
436
444Tensor1D_F32 *nn_rand1d_f32(size_t shape[1]) {
445 Tensor1D_F32 *tensor = nn_tensor1d_f32(shape, NULL);
446 size_t n = shape[0];
447 for (size_t i = 0; i < n; i += 1) {
448 tensor->data[i] = rand();
449 }
450 return tensor;
451}
452
460Tensor2D_F32 *nn_rand2d_f32(size_t shape[2]) {
461 Tensor2D_F32 *tensor = nn_tensor2d_f32(shape, NULL);
462 size_t n = shape[0] * shape[1];
463 for (size_t i = 0; i < n; i += 1) {
464 tensor->data[i] = rand();
465 }
466 return tensor;
467}
468
469
470/* ======================================================================================================== */
471/* Tensor Prints */
472/* ======================================================================================================== */
473
481void nn_print_tensor1d_f32(const Tensor1D_F32 *tensor) {
482 printf("[");
483 for (size_t i=0; i<tensor->shape[0]; i+=1) {
484 nn_print_f32(*((float *)tensor->data + i), 3);
485 if (i < tensor->shape[0]-1) {
486 printf(" ");
487 }
488 }
489 printf("]\n");
490}
491
499void nn_print_tensor2d_f32(const Tensor2D_F32 *tensor) {
500 printf("[");
501 for (size_t i=0; i<tensor->shape[0]; i+=1) {
502 if (i == 0) {
503 printf("[");
504 }
505 else {
506 printf(" [");
507 }
508 for (size_t j=0; j<tensor->shape[1]; j+=1) {
509 nn_print_f32(*((float *)tensor->data + i*tensor->shape[1] + j), 3);
510 if (j < tensor->shape[1]-1) {
511 printf(" ");
512 }
513 }
514 printf(" ]");
515 if (i < tensor->shape[0]-1) {
516 printf("\n");
517 }
518 }
519 printf("]\n");
520}
521
529void nn_print_tensor3d_f32(const Tensor3D_F32 *tensor) {
530 printf("[");
531 for (size_t i=0; i<tensor->shape[0]; i+=1) {
532 if (i == 0) {
533 printf("[");
534 }
535 else {
536 printf("\n [");
537 }
538 for (size_t j=0; j<tensor->shape[1]; j+=1) {
539 if (j == 0) {
540 printf("[");
541 }
542 else {
543 printf(" [");
544 }
545 for (size_t k=0; k<tensor->shape[2]; k+=1) {
546 nn_print_f32(*((float *)tensor->data + i*tensor->shape[1]*tensor->shape[2] + j*tensor->shape[2] + k), 3);
547 if (k < tensor->shape[2]-1) {
548 printf(" ");
549 }
550 }
551 printf(" ]");
552 }
553 printf("]");
554 if (i < tensor->shape[0]-1) {
555 printf("\n");
556 }
557 }
558 printf("]\n");
559}
560
568void nn_print_tensor4d_f32(const Tensor4D_F32 *tensor) {
569 printf("[");
570 for (size_t i=0; i<tensor->shape[0]; i+=1) {
571 if (i == 0) {
572 printf("[");
573 }
574 else {
575 printf("\n [");
576 }
577 for (size_t j=0; j<tensor->shape[1]; j+=1) {
578 if (j == 0) {
579 printf("[");
580 }
581 else {
582 printf("\n [");
583 }
584 for (size_t k=0; k<tensor->shape[2]; k+=1) {
585 if (k == 0) {
586 printf("[");
587 }
588 else {
589 printf(" [");
590 }
591 for (size_t l=0; l<tensor->shape[3]; l+=1) {
592 nn_print_f32(*((float *)tensor->data + i*tensor->shape[1]*tensor->shape[2]*tensor->shape[3] + j*tensor->shape[2]*tensor->shape[3] + k*tensor->shape[3] + l), 3);
593 if (l < tensor->shape[3]-1) {
594 printf(" ");
595 }
596 }
597 printf(" ]");
598 if (k < tensor->shape[2]-1) {
599 printf("\n");
600 }
601 }
602 printf("]");
603 if (j < tensor->shape[1]-1) {
604 printf("\n");
605 }
606 }
607 printf("]");
608 if (i < tensor->shape[0]-1) {
609 printf("\n");
610 }
611 }
612 printf("]\n");
613}
614
615
616/* ======================================================================================================== */
617/* Comparison */
618/* ======================================================================================================== */
629uint8_t nn_equals0d_f32(const Tensor0D_F32 *a, const Tensor0D_F32 *b, float rel_err) {
630 return nn_equal_f32(a->data, b->data, rel_err);
631}
632
643uint8_t nn_equals1d_f32(const Tensor1D_F32 *a, const Tensor1D_F32 *b, float rel_err) {
644 nn_assert(a->shape[0] == b->shape[0], "Cannot compare tensors of different shapes");
645
646 size_t n = a->shape[0];
647 for (size_t i = 0; i < n; i += 1) {
648 if (!nn_equal_f32(a->data[i], b->data[i], rel_err)) {
649 return 0;
650 }
651 }
652 return 1;
653}
654
665uint8_t nn_equals2d_f32(const Tensor2D_F32 *a, const Tensor2D_F32 *b, float rel_err) {
666 nn_assert(a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1], "Cannot compare tensors of different shapes");
667
668 size_t n = a->shape[0] * a->shape[1];
669 for (size_t i = 0; i < n; i += 1) {
670 if (!nn_equal_f32(a->data[i], b->data[i], rel_err)) {
671 return 0;
672 }
673 }
674 return 1;
675}
676
687uint8_t nn_equals3d_f32(const Tensor3D_F32 *a, const Tensor3D_F32 *b, float rel_err) {
688 nn_assert(a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1] && a->shape[2] == b->shape[2], "Cannot compare tensors of different shapes");
689
690 size_t n = a->shape[0] * a->shape[1] * a->shape[2];
691 for (size_t i = 0; i < n; i += 1) {
692 if (!nn_equal_f32(a->data[i], b->data[i], rel_err)) {
693 return 0;
694 }
695 }
696 return 1;
697}
698
709uint8_t nn_equals4d_f32(const Tensor4D_F32 *a, const Tensor4D_F32 *b, float rel_err) {
710 nn_assert(a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1] && a->shape[2] == b->shape[2] && a->shape[3] == b->shape[3], "Cannot compare tensors of different shapes");
711
712 size_t n = a->shape[0] * a->shape[1] * a->shape[2] * a->shape[3];
713 for (size_t i = 0; i < n; i += 1) {
714 if (!nn_equal_f32(a->data[i], b->data[i], rel_err)) {
715 return 0;
716 }
717 }
718 return 1;
719}
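// Illustrative sketch: comparing a computed tensor against a golden reference
// with a relative tolerance. The function name and the data values are made up
// for this example.
static void example_compare(void) {
  float golden_data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float result_data[4] = {1.0001f, 1.9999f, 3.0002f, 4.0f};
  Tensor2D_F32 *golden = nn_as_tensor2d_f32((size_t[]){2, 2}, golden_data);
  Tensor2D_F32 *result = nn_as_tensor2d_f32((size_t[]){2, 2}, result_data);
  uint8_t ok = nn_equals2d_f32(golden, result, 1e-3f);
  (void)ok;  // 1 if every element matches within the tolerance
  free(golden);
  free(result);
}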
720
721
722/* ======================================================================================================== */
723/* Unary */
724/* ======================================================================================================== */
725
734void nn_max1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x) {
735 size_t n = x->shape[0];
736 float *x_data = x->data;
737
738 #ifdef CONFIG_BACKEND_RISCV_V
739 vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
740
741 while (n > 0) {
742 size_t vl = __riscv_vsetvl_e32m1(n);
743 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
744 vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
745 x_data += vl;
746 n -= vl;
747 }
748 y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
749 #else /* scalar implementation */
750 y->data = -FLT_MAX;
751 for (size_t i = 0; i < n; i += 1) {
752 float val = x_data[i];
753 y->data = val > y->data ? val : y->data;
754 }
755 #endif
756}
757
766void nn_max2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x) {
767 size_t n = x->shape[0] * x->shape[1];
768 float *x_data = x->data;
769
770 #ifdef CONFIG_BACKEND_RISCV_V
771 vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
772
773 while (n > 0) {
774 size_t vl = __riscv_vsetvl_e32m1(n);
775 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
776 vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
777 x_data += vl;
778 n -= vl;
779 }
780 y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
781 #else /* scalar implementation */
782 y->data = -FLT_MAX;
783 for (size_t i = 0; i < n; i += 1) {
784 float val = x_data[i];
785 y->data = val > y->data ? val : y->data;
786 }
787 #endif
788}
789
798void nn_min1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x) {
799 size_t n = x->shape[0];
800 float *x_data = x->data;
801
802 #ifdef CONFIG_BACKEND_RISCV_V
803 vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
804
805 while (n > 0) {
806 size_t vl = __riscv_vsetvl_e32m1(n);
807 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
808 vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
809 x_data += vl;
810 n -= vl;
811 }
812 y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
813 #else /* scalar implementation */
814 y->data = FLT_MAX;
815 for (size_t i = 0; i < n; i += 1) {
816 float val = x_data[i];
817 y->data = val < y->data ? val : y->data;
818 }
819 #endif
820}
821
830void nn_min2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x) {
831 size_t n = x->shape[0] * x->shape[1];
832 float *x_data = x->data;
833
834 #ifdef CONFIG_BACKEND_RISCV_V
835 vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
836
837 while (n > 0) {
838 size_t vl = __riscv_vsetvl_e32m1(n);
839 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
840 vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
841 x_data += vl;
842 n -= vl;
843 }
844 y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
845 #else /* scalar implementation */
846 y->data = FLT_MAX;
847 for (size_t i = 0; i < n; i += 1) {
848 float val = x_data[i];
849 y->data = val < y->data ? val : y->data;
850 }
851 #endif
852}
853
854/* ======================================================================================================== */
855/* Addition */
856/* ======================================================================================================== */
868void nn_add1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2) {
869 nn_assert(x1->shape[0] == x2->shape[0], "Cannot add tensors of different shapes");
870 nn_assert(y->shape[0] == x1->shape[0], "Cannot add tensors of different shapes");
871
872 size_t n = y->shape[0];
873 float *x1_data = x1->data;
874 float *x2_data = x2->data;
875 float *y_data = y->data;
876
877 #ifdef CONFIG_BACKEND_RISCV_V
878 while (n > 0) {
879 size_t vl = __riscv_vsetvl_e32m1(n);
880 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
881 vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
882 vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
883 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
884 x1_data += vl;
885 x2_data += vl;
886 y_data += vl;
887 n -= vl;
888 }
889 #else /* scalar implementation */
890 for (size_t i = 0; i < n; i += 1) {
891 y_data[i] = x1_data[i] + x2_data[i];
892 }
893 #endif
894}
895
907void nn_add2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2) {
908 nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot add tensors of different shapes");
909 nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot add tensors of different shapes");
910
911 size_t n = y->shape[0] * y->shape[1];
912 float *x1_data = x1->data;
913 float *x2_data = x2->data;
914 float *y_data = y->data;
915
916 #ifdef CONFIG_BACKEND_RISCV_V
917 while (n > 0) {
918 size_t vl = __riscv_vsetvl_e32m1(n);
919 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
920 vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
921 vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
922 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
923 x1_data += vl;
924 x2_data += vl;
925 y_data += vl;
926 n -= vl;
927 }
928 #else /* scalar implementation */
929 for (size_t i = 0; i < n; i += 1) {
930 y_data[i] = x1_data[i] + x2_data[i];
931 }
932 #endif
933}
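// Illustrative sketch: element-wise addition of two 2x2 tensors. All three
// shapes must match, as enforced by the nn_assert calls above. The function
// name and the values are made up for this example.
static void example_add(void) {
  float a_data[4] = {1, 2, 3, 4};
  float b_data[4] = {10, 20, 30, 40};
  Tensor2D_F32 *a = nn_as_tensor2d_f32((size_t[]){2, 2}, a_data);
  Tensor2D_F32 *b = nn_as_tensor2d_f32((size_t[]){2, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});
  nn_add2d_f32(y, a, b);   // y = {11, 22, 33, 44}
  free(a);
  free(b);
  free(y->data);
  free(y);
}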
934
946void nn_addscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar) {
947 nn_assert(y->shape[0] == x->shape[0], "Cannot add tensors of different shapes");
948
949 size_t n = y->shape[0];
950 float *x_data = x->data;
951 float *y_data = y->data;
952
953 #ifdef CONFIG_BACKEND_RISCV_V
954 while (n > 0) {
955 size_t vl = __riscv_vsetvl_e32m1(n);
956 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
957 vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
958 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
959 x_data += vl;
960 y_data += vl;
961 n -= vl;
962 }
963 #else /* scalar implementation */
964 for (size_t i = 0; i < n; i += 1) {
965 y_data[i] = x_data[i] + scalar;
966 }
967 #endif
968}
969
981void nn_addscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar) {
982 nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot add tensors of different shapes");
983
984 size_t n = y->shape[0] * y->shape[1];
985 float *x_data = x->data;
986 float *y_data = y->data;
987
988 #ifdef CONFIG_BACKEND_RISCV_V
989 while (n > 0) {
990 size_t vl = __riscv_vsetvl_e32m1(n);
991 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
992 vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
993 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
994 x_data += vl;
995 y_data += vl;
996 n -= vl;
997 }
998 #else /* scalar implementation */
999 for (size_t i = 0; i < n; i += 1) {
1000 y_data[i] = x_data[i] + scalar;
1001 }
1002 #endif
1003}
1004
1005/* ======================================================================================================== */
1006/* Multiplication */
1007/* ======================================================================================================== */
1008
1020void nn_mul1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2) {
1021 nn_assert(x1->shape[0] == x2->shape[0], "Cannot multiply tensors of different shapes");
1022 nn_assert(y->shape[0] == x1->shape[0], "Cannot multiply tensors of different shapes");
1023
1024 size_t n = y->shape[0];
1025 float *x1_data = x1->data;
1026 float *x2_data = x2->data;
1027 float *y_data = y->data;
1028
1029 #ifdef CONFIG_BACKEND_RISCV_V
1030 while (n > 0) {
1031 size_t vl = __riscv_vsetvl_e32m1(n);
1032 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1033 vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1034 vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
1035 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1036 x1_data += vl;
1037 x2_data += vl;
1038 y_data += vl;
1039 n -= vl;
1040 }
1041 #else /* scalar implementation */
1042 for (size_t i = 0; i < n; i += 1) {
1043 y_data[i] = x1_data[i] * x2_data[i];
1044 }
1045 #endif
1046}
1047
1059void nn_mul2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2) {
1060 nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot multiply tensors of different shapes");
1061 nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot multiply tensors of different shapes");
1062
1063 size_t n = y->shape[0] * y->shape[1];
1064 float *x1_data = x1->data;
1065 float *x2_data = x2->data;
1066 float *y_data = y->data;
1067
1068 #ifdef CONFIG_BACKEND_RISCV_V
1069 while (n > 0) {
1070 size_t vl = __riscv_vsetvl_e32m1(n);
1071 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1072 vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1073 vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
1074 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1075 x1_data += vl;
1076 x2_data += vl;
1077 y_data += vl;
1078 n -= vl;
1079 }
1080 #else /* scalar implementation */
1081 for (size_t i = 0; i < n; i += 1) {
1082 y_data[i] = x1_data[i] * x2_data[i];
1083 }
1084 #endif
1085}
1086
1098void nn_mulscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar) {
1099 nn_assert(y->shape[0] == x->shape[0], "Cannot multiply tensors of different shapes");
1100
1101 size_t n = y->shape[0];
1102 float *x_data = x->data;
1103 float *y_data = y->data;
1104
1105 #ifdef CONFIG_BACKEND_RISCV_V
1106 while (n > 0) {
1107 size_t vl = __riscv_vsetvl_e32m1(n);
1108 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1109 vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
1110 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1111 x_data += vl;
1112 y_data += vl;
1113 n -= vl;
1114 }
1115 #else /* scalar implementation */
1116 for (size_t i = 0; i < n; i += 1) {
1117 y_data[i] = x_data[i] * scalar;
1118 }
1119 #endif
1120}
1121
1133void nn_mulscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar) {
1134 nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot multiply tensors of different shapes");
1135
1136 size_t n = y->shape[0] * y->shape[1];
1137 float *x_data = x->data;
1138 float *y_data = y->data;
1139
1140 #ifdef CONFIG_BACKEND_RISCV_V
1141 while (n > 0) {
1142 size_t vl = __riscv_vsetvl_e32m1(n);
1143 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1144 vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
1145 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1146 x_data += vl;
1147 y_data += vl;
1148 n -= vl;
1149 }
1150 #else /* scalar implementation */
1151 for (size_t i = 0; i < n; i += 1) {
1152 y_data[i] = x_data[i] * scalar;
1153 }
1154 #endif
1155}
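// Illustrative sketch: element-wise and scalar multiplication. Here y is scaled
// in place by passing the same tensor as input and output, which is safe because
// each element (or vector strip) is read before it is written. The function name
// and values are made up for this example.
static void example_mul(void) {
  float a_data[4] = {1, 2, 3, 4};
  float b_data[4] = {2, 2, 2, 2};
  Tensor2D_F32 *a = nn_as_tensor2d_f32((size_t[]){2, 2}, a_data);
  Tensor2D_F32 *b = nn_as_tensor2d_f32((size_t[]){2, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});
  nn_mul2d_f32(y, a, b);           // y = {2, 4, 6, 8}
  nn_mulscalar2d_f32(y, y, 0.5f);  // y = {1, 2, 3, 4}
  free(a);
  free(b);
  free(y->data);
  free(y);
}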
1156
1157
1158/* ======================================================================================================== */
1159/* Convolution */
1160/* ======================================================================================================== */
1161
1170void nn_nchw_to_nhwc_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in) {
1171 nn_assert(out->shape[0] == in->shape[0], "Cannot convert between tensors of different shapes");
1172 nn_assert(out->shape[1] == in->shape[2], "Cannot convert between tensors of different shapes");
1173 nn_assert(out->shape[2] == in->shape[3], "Cannot convert between tensors of different shapes");
1174 nn_assert(out->shape[3] == in->shape[1], "Cannot convert between tensors of different shapes");
1175
1176 size_t batch_size = in->shape[0];
1177 size_t height = in->shape[2];
1178 size_t width = in->shape[3];
1179 size_t channels = in->shape[1];
1180
1181 for (size_t n = 0; n < batch_size; n += 1) {
1182 for (size_t c = 0; c < channels; c += 1) {
1183 for (size_t h = 0; h < height; h += 1) {
1184 for (size_t w = 0; w < width; w += 1) {
1185 size_t nchw_index = n * channels * height * width + c * height * width + h * width + w;
1186 size_t nhwc_index = n * height * width * channels + h * width * channels + w * channels + c;
1187 ((float *)out->data)[nhwc_index] = ((float *)in->data)[nchw_index];
1188 }
1189 }
1190 }
1191 }
1192}
1193
1194
1203void nn_nhwc_to_nchw_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in) {
1204 nn_assert(out->shape[0] == in->shape[0], "Cannot convert between tensors of different shapes");
1205 nn_assert(out->shape[1] == in->shape[3], "Cannot convert between tensors of different shapes");
1206 nn_assert(out->shape[2] == in->shape[1], "Cannot convert between tensors of different shapes");
1207 nn_assert(out->shape[3] == in->shape[2], "Cannot convert between tensors of different shapes");
1208
1209 size_t batch_size = in->shape[0];
1210 size_t height = in->shape[1];
1211 size_t width = in->shape[2];
1212 size_t channels = in->shape[3];
1213
1214 for (size_t n = 0; n < batch_size; n += 1) {
1215 for (size_t c = 0; c < channels; c += 1) {
1216 for (size_t h = 0; h < height; h += 1) {
1217 for (size_t w = 0; w < width; w += 1) {
1218 size_t nhwc_index = n * height * width * channels + h * width * channels + w * channels + c;
1219 size_t nchw_index = n * channels * height * width + c * height * width + h * width + w;
1220 ((float *)out->data)[nchw_index] = ((float *)in->data)[nhwc_index];
1221 }
1222 }
1223 }
1224 }
1225}
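// Illustrative sketch: converting an NCHW tensor (the layout used by e.g. PyTorch
// exports) into the NHWC layout that nn_conv2d_f32 expects. The output shape is
// the input shape with the channel dimension moved to the end. The function name
// and shapes are made up for this example.
static void example_layout(void) {
  Tensor4D_F32 *nchw = nn_zeros4d_f32((size_t[]){1, 3, 4, 4});   // N, C, H, W
  Tensor4D_F32 *nhwc = nn_zeros4d_f32((size_t[]){1, 4, 4, 3});   // N, H, W, C
  nn_nchw_to_nhwc_f32(nhwc, nchw);
  free(nchw->data);
  free(nchw);
  free(nhwc->data);
  free(nhwc);
}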
1226
1236void nn_conv2d_f32(
1237 Tensor4D_F32 *out, const Tensor4D_F32 *in,
1238 const Tensor4D_F32 *weight, const Tensor1D_F32 *bias,
1239 const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups) {
1240
1241 size_t batch_size = in->shape[0];
1242 size_t in_height = in->shape[1];
1243 size_t in_width = in->shape[2];
1244 size_t in_channels = in->shape[3];
1245
1246 size_t out_height = out->shape[1];
1247 size_t out_width = out->shape[2];
1248 size_t out_channels = out->shape[3];
1249
1250 size_t kernel_height = weight->shape[0];
1251 size_t kernel_width = weight->shape[1];
1252 size_t stride_height = stride[0];
1253 size_t stride_width = stride[1];
1254 size_t padding_height = padding[0];
1255 size_t padding_width = padding[1];
1256 size_t dilation_height = dilation[0];
1257 size_t dilation_width = dilation[1];
1258
1259 nn_assert(out->shape[0] == batch_size, "Output batch size must match input batch size");
1260 nn_assert(weight->shape[3] == out_channels, "Weight output channels must match output tensor channels");
1261 nn_assert(weight->shape[2] * groups == in_channels, "Weight input channels times groups must match input channels");
1262 nn_assert(out_height == (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height + 1, "Output height does not match convolution parameters");
1263 nn_assert(out_width == (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1, "Output width does not match convolution parameters");
1264 nn_assert(groups > 0, "Groups must be positive");
1265 nn_assert(in_channels % groups == 0, "Input channels must be divisible by groups");
1266 nn_assert(out_channels % groups == 0, "Output channels must be divisible by groups");
1267
1268
1269 // Initialize output tensor to zeros
1270 memset(out->data, 0, batch_size * out_height * out_width * out_channels * sizeof(float));
1271
1272 #ifdef CONFIG_BACKEND_GEMMINI
1273 if (!bias) {
1274 bias = nn_zeros1d_f32((size_t []){ out_channels });
1275 }
1276
1277 if (groups == 1) {
1278 tiled_conv_auto(
1279 batch_size, in_height, in_width, in_channels,
1280 out_channels, out_height, out_width,
1281 stride_height, dilation_height, 1, padding_height, kernel_height,
1282 0, 0, 0, 0, 0,
1283 in->data,
1284 weight->data,
1285 bias->data,
1286 out->data,
1287 NO_ACTIVATION, ACC_SCALE_IDENTITY,
1288 0, 0, 0,
1289 WS);
1290 }
1291 else if (groups == in_channels) {
1292 assert(weight->shape[2] == 1);
1293
1294 Tensor *in_nchw = NN_tensor(4, (size_t[]){batch_size, in_channels, in_height, in_width}, DTYPE_F32, NULL);
1295 Tensor *out_nchw = NN_tensor(4, (size_t[]){batch_size, out_channels, out_height, out_width}, DTYPE_F32, NULL);
1296
1297 Tensor *weight_1hwc = NN_tensor(4, (size_t[]){1, kernel_height, kernel_width, out_channels}, DTYPE_F32, weight->data);
1298 Tensor *weight_1chw = NN_tensor(4, (size_t[]){1, out_channels, kernel_height, kernel_width}, DTYPE_F32, NULL);
1299
1300 NN_nhwc_to_nchw(in_nchw, in);
1301 NN_nhwc_to_nchw(weight_1chw, weight_1hwc);
1302
1303 for (size_t g = 0; g < groups; g += 1) {
1304 tiled_conv_auto(
1305 batch_size, in_height, in_width, 1,
1306 1, out_height, out_width,
1307 stride_height, dilation_height, 1, padding_height, kernel_height,
1308 0, 0, 0, 0, 0,
1309 ((float *)in_nchw->data) + g * in_height * in_width,
1310 ((float *)weight_1chw->data) + g * kernel_height * kernel_width,
1311 ((float *)bias->data) + g,
1312 ((float *)out_nchw->data) + g * out_height * out_width,
1313 NO_ACTIVATION, ACC_SCALE_IDENTITY,
1314 0, 0, 0,
1315 WS);
1316 }
1317
1318 NN_nchw_to_nhwc(out, out_nchw);
1319
1320 }
1321 else {
1322 printf("[ERROR] Unsupported conv2d operation for groups other than 1 or in_channels\n");
1323 }
1324
1325
1326 #else
1327
1328 if (groups == 1) {
1329 // Standard convolution
1330 for (size_t b = 0; b < batch_size; b += 1) {
1331 for (size_t oc = 0; oc < out_channels; oc += 1) {
1332 for (size_t oh = 0; oh < out_height; oh += 1) {
1333 for (size_t ow = 0; ow < out_width; ow += 1) {
1334 float value = 0.0f;
1335 for (size_t kh = 0; kh < kernel_height; kh += 1) {
1336 for (size_t kw = 0; kw < kernel_width; kw += 1) {
1337 for (size_t ic = 0; ic < in_channels; ic += 1) {
1338 size_t ih = oh * stride_height + kh * dilation_height - padding_height;
1339 size_t iw = ow * stride_width + kw * dilation_width - padding_width;
1340 if (ih < in_height && iw < in_width) {
1341 size_t in_idx = b * in_height * in_width * in_channels
1342 + ih * in_width * in_channels
1343 + iw * in_channels
1344 + ic;
1345 size_t weight_idx = kh * kernel_width * in_channels * out_channels
1346 + kw * in_channels * out_channels
1347 + ic * out_channels
1348 + oc;
1349 value += ((float*)in->data)[in_idx] * ((float*)weight->data)[weight_idx];
1350 }
1351 }
1352 }
1353 }
1354 if (bias != NULL) {
1355 value += ((float*)bias->data)[oc];
1356 }
1357 size_t out_idx = b * out_height * out_width * out_channels
1358 + oh * out_width * out_channels
1359 + ow * out_channels
1360 + oc;
1361 ((float*)out->data)[out_idx] = value;
1362 }
1363 }
1364 }
1365 }
1366 }
1367 else if (groups == in_channels) {
1368 // Depthwise convolution
1369 for (size_t b = 0; b < batch_size; b += 1) {
1370 for (size_t oc = 0; oc < out_channels; oc += 1) {
1371 for (size_t oh = 0; oh < out_height; oh += 1) {
1372 for (size_t ow = 0; ow < out_width; ow += 1) {
1373 float value = 0.0f;
1374 for (size_t kh = 0; kh < kernel_height; kh += 1) {
1375 for (size_t kw = 0; kw < kernel_width; kw += 1) {
1376 size_t ih = oh * stride_height + kh * dilation_height - padding_height;
1377 size_t iw = ow * stride_width + kw * dilation_width - padding_width;
1378 if (ih < in_height && iw < in_width) {
1379 size_t in_idx = b * in_height * in_width * in_channels
1380 + ih * in_width * in_channels
1381 + iw * in_channels
1382 + oc;
1383 size_t weight_idx = kh * kernel_width * in_channels
1384 + kw * in_channels
1385 + oc;
1386 value += ((float *)in->data)[in_idx] * ((float *)weight->data)[weight_idx];
1387 }
1388 }
1389 }
1390 if (bias != NULL) {
1391 value += ((float *)bias->data)[oc];
1392 }
1393 size_t out_idx = b * out_height * out_width * out_channels
1394 + oh * out_width * out_channels
1395 + ow * out_channels
1396 + oc;
1397 ((float *)out->data)[out_idx] = value;
1398 }
1399 }
1400 }
1401 }
1402 }
1403 else {
1404 printf("[ERROR] Unsupported conv2d operation for groups other than 1 or in_channels\n");
1405 }
1406 #endif
1407}
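// Illustrative sketch: a 3x3 convolution over an 8x8, 3-channel input in NHWC
// layout. With stride 1 and padding 1 the spatial size is preserved:
// out_h = (8 + 2*1 - 1*(3-1) - 1) / 1 + 1 = 8. The weight layout is
// (kernel_h, kernel_w, in_channels / groups, out_channels). The function name
// and shapes are made up for this example.
static void example_conv2d(void) {
  Tensor4D_F32 *in     = nn_zeros4d_f32((size_t[]){1, 8, 8, 3});    // N, H, W, C_in
  Tensor4D_F32 *weight = nn_zeros4d_f32((size_t[]){3, 3, 3, 16});   // KH, KW, C_in, C_out
  Tensor1D_F32 *bias   = nn_zeros1d_f32((size_t[]){16});            // C_out
  Tensor4D_F32 *out    = nn_zeros4d_f32((size_t[]){1, 8, 8, 16});   // N, H_out, W_out, C_out
  size_t stride[2]   = {1, 1};
  size_t padding[2]  = {1, 1};
  size_t dilation[2] = {1, 1};
  nn_conv2d_f32(out, in, weight, bias, stride, padding, dilation, /*groups=*/1);
  // ... use out->data, then free the tensors created above
}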
1408
1409
1410/* ======================================================================================================== */
1411/* MatMul */
1412/* ======================================================================================================== */
1413
1425void nn_dot_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2) {
1426 nn_assert(x1->shape[0] == x2->shape[0], "Cannot dot tensors of different shapes");
1427 nn_assert(y->shape[0] == x1->shape[0], "Cannot dot tensors of different shapes");
1428
1429 size_t n = y->shape[0];
1430 float *x1_data = x1->data;
1431 float *x2_data = x2->data;
1432 float *y_data = y->data;
1433
1434 #ifdef CONFIG_BACKEND_RISCV_V
1435 vfloat32m1_t vec_sum = __riscv_vfmv_s_f_f32m1(0, 1);
1436 while (n > 0) {
1437 size_t vl = __riscv_vsetvl_e32m1(n);
1438 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1439 vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1440 vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(__riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl), vec_sum, vl);
1441 x1_data += vl;
1442 x2_data += vl;
1443 n -= vl;
1444 }
1445 y_data[0] = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1446 #else /* scalar implementation */
1447 float sum = 0.0f;
1448 for (size_t i = 0; i < n; i += 1) {
1449 sum += x1_data[i] * x2_data[i];
1450 }
1451 y_data[0] = sum;
1452 #endif
1453}
1454
1466void nn_mv_f32(Tensor1D_F32 *y, const Tensor2D_F32 *x1, const Tensor1D_F32 *x2) {
1467 nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MV on tensors of different shapes");
1468 nn_assert(y->shape[0] == x1->shape[0], "Cannot perform MV on tensors of different shapes");
1469
1470 const size_t n = x1->shape[0]; // rows in matrix
1471 const size_t m = x1->shape[1]; // columns in matrix
1472 float *x1_data = x1->data;
1473 float *x2_data = x2->data;
1474 float *y_data = y->data;
1475
1476 for (size_t i = 0; i < y->shape[0]; i += 1) {
1477 float sum = 0.0;
1478 for (size_t j = 0; j < m; j += 1) {
1479 sum += x1_data[i * m + j] * x2_data[j];
1480 }
1481 y_data[i] = sum;
1482 }
1483}
1484
1485
1497void nn_mm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2) {
1498 nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
1499 nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform MatMul on tensors of different shapes");
1500
1501 const size_t n = x1->shape[0];
1502 const size_t m = x1->shape[1];
1503 const size_t p = x2->shape[1];
1504
1505 for (size_t i = 0; i < n; i += 1) {
1506 float *x1_row = x1->data + i * m;
1507 float *y_row = y->data + i * p;
1508
1509 #ifdef CONFIG_BACKEND_RISCV_V
1510
1511 size_t vlmax = __riscv_vsetvlmax_e32m1();
1512 for (size_t j = 0; j < p; j += 1) {
1513 vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1514 vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1515
1516 float *x2_col = x2->data + j;
1517 size_t k = m;
1518
1519 while (k > 0) {
1520 size_t vl = __riscv_vsetvl_e32m1(k);
1521 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_row, vl);
1522 vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_col, p * sizeof(float), vl);
1523 vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1524
1525 x1_row += vl;
1526 x2_col += vl * p;
1527 k -= vl;
1528 }
1529
1530 #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1531 vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1532 #else
1533 vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1534 #endif
1535 y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1536 }
1537 #else
1538 for (size_t j = 0; j < p; j += 1) {
1539 float *x2_row = x2->data + j;
1540
1541 float sum = 0.f;
1542 for (size_t k = 0; k < m; k += 1) {
1543 sum += x1_row[k] * x2_row[k * p];
1544 }
1545 y_row[j] = sum;
1546 }
1547 #endif
1548 }
1549}
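// Illustrative sketch: y (2x2) = x1 (2x3) @ x2 (3x2). nn_mm_f32 requires
// x1->shape[1] == x2->shape[0] and y sized (x1 rows, x2 columns). The function
// name and values are made up for this example.
static void example_mm(void) {
  float a_data[6] = {1, 2, 3,
                     4, 5, 6};
  float b_data[6] = {1, 0,
                     0, 1,
                     1, 1};
  Tensor2D_F32 *a = nn_as_tensor2d_f32((size_t[]){2, 3}, a_data);
  Tensor2D_F32 *b = nn_as_tensor2d_f32((size_t[]){3, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});
  nn_mm_f32(y, a, b);   // y = {4, 5, 10, 11}
  free(a);
  free(b);
  free(y->data);
  free(y);
}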
1550
1563void nn_addmm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *c, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2) {
1564 nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMulAdd on tensors of different shapes");
1565 nn_assert(y->shape[0] == c->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform MatMulAdd on tensors of different shapes");
1566
1567 const size_t n = x1->shape[0];
1568 const size_t m = x1->shape[1];
1569 const size_t p = x2->shape[1];
1570
1571 for (size_t i = 0; i < n; i += 1) {
1572 float *x1_row = x1->data + i * m;
1573 float *c_row = c->data + i * p;
1574 float *y_row = y->data + i * p;
1575
1576 #ifdef CONFIG_BACKEND_RISCV_V
1577
1578 size_t vlmax = __riscv_vsetvlmax_e32m1();
1579 for (size_t j = 0; j < p; j += 1) {
1580 vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1581 vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1582
1583 float *x2_col = x2->data + j;
1584 size_t k = m;
1585
1586 while (k > 0) {
1587 size_t vl = __riscv_vsetvl_e32m1(k);
1588 vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_row, vl);
1589 vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_col, p * sizeof(float), vl);
1590 vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1591
1592 x1_row += vl;
1593 x2_col += vl * p;
1594 k -= vl;
1595 }
1596
1597 #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1598 vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1599 #else
1600 vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1601 #endif
1602 y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum) + c_row[j];
1603 }
1604
1605 #else
1606 for (size_t j = 0; j < p; j += 1) {
1607 float *x2_col = x2->data + j;
1608
1609 float sum = 0.f;
1610 for (size_t k = 0; k < m; k += 1) {
1611 sum += x1_row[k] * x2_col[k * p];
1612 }
1613 y_row[j] = sum + c_row[j];
1614 }
1615 #endif
1616 x1_row += m;
1617 y_row += p;
1618 }
1619}
1620
1633void nn_linear_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, const Tensor2D_F32 *weight, const Tensor1D_F32 *bias) {
1634 nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
1635 nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
1636 nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
1637
1638 const size_t batch_size = x->shape[0];
1639 const size_t in_features = x->shape[1];
1640 const size_t out_features = weight->shape[0];
1641
1642 float *x_batch_data = x->data;
1643 float *y_batch_data = y->data;
1644
1645 for (size_t i = 0; i < batch_size; i += 1) {
1646 float *x_data = x_batch_data;
1647 float *y_data = y_batch_data;
1648
1649 #ifdef CONFIG_BACKEND_RISCV_V
1650 size_t vlmax = __riscv_vsetvlmax_e32m1();
1651
1652 for (size_t j = 0; j < out_features; j += 1) {
1653 vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1654 vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1655
1656 float *weight_row = weight->data + j * in_features;
1657 size_t n = in_features;
1658
1659 while (n > 0) {
1660 size_t vl = __riscv_vsetvl_e32m1(n);
1661 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1662 vfloat32m1_t vec_w = __riscv_vle32_v_f32m1(weight_row, vl);
1663 vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x, vec_w, vl);
1664
1665 x_data += vl;
1666 weight_row += vl;
1667 n -= vl;
1668 }
1669
1670 #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1671 vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1672 #else
1673 vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1674 #endif
1675
1676 float sum = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1677 if (bias) {
1678 sum += bias->data[j];
1679 }
1680 y_data[j] = sum;
1681 x_data = x_batch_data; // reset x_data pointer for next output feature
1682 }
1683 #else /* scalar implementation */
1684 for (size_t j = 0; j < out_features; j += 1) {
1685 float *weight_row = weight->data + j * in_features;
1686
1687 float sum = 0.f;
1688 for (size_t k = 0; k < in_features; k += 1) {
1689 sum += x_data[k] * weight_row[k];
1690 }
1691 if (bias) {
1692 sum += bias->data[j];
1693 }
1694 y_data[j] = sum;
1695 }
1696 #endif
1697
1698 x_batch_data += in_features;
1699 y_batch_data += out_features;
1700 }
1701}
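// Illustrative sketch: a fully-connected layer for a batch of 2 samples with 4
// input features and 3 output features. The weight tensor is stored as
// (out_features, in_features), matching the layout checked by the asserts above.
// The function name and shapes are made up for this example.
static void example_linear(void) {
  Tensor2D_F32 *x      = nn_rand2d_f32((size_t[]){2, 4});    // (batch, in_features)
  Tensor2D_F32 *weight = nn_rand2d_f32((size_t[]){3, 4});    // (out_features, in_features)
  Tensor1D_F32 *bias   = nn_zeros1d_f32((size_t[]){3});      // (out_features)
  Tensor2D_F32 *y      = nn_zeros2d_f32((size_t[]){2, 3});   // (batch, out_features)
  nn_linear_f32(y, x, weight, bias);
  nn_print_tensor2d_f32(y);
  // ... free the tensors created above
}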
1702
1703
1704/* ======================================================================================================== */
1705/* Non-linear */
1706/* ======================================================================================================== */
1707
1719void nn_elu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float alpha) {
1720 nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ELU on tensors of different shapes");
1721
1722 const size_t n = y->shape[0] * y->shape[1];
1723 float *x_data = x->data;
1724 float *y_data = y->data;
1725
1726 for (size_t i = 0; i < n; i += 1) {
1727 if (x_data[i] > 0) {
1728 y_data[i] = x_data[i];
1729 }
1730 else {
1731 y_data[i] = alpha * (expf(x_data[i]) - 1.f);
1732 }
1733 }
1734}
1735
1746void nn_relu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x) {
1747 nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ReLU on tensors of different shapes");
1748
1749 size_t n = y->shape[0] * y->shape[1];
1750 float *x_data = x->data;
1751 float *y_data = y->data;
1752
1753 #ifdef CONFIG_BACKEND_RISCV_V
1754 float zero = 0.0f;
1755
1756 while (n > 0) {
1757 size_t vl = __riscv_vsetvl_e32m1(n);
1758 vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1759 vfloat32m1_t vec_y = __riscv_vfmax_vf_f32m1(vec_x, zero, vl);
1760 __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1761 x_data += vl;
1762 y_data += vl;
1763 n -= vl;
1764 }
1765 #else /* scalar implementation */
1766 for (size_t i = 0; i < n; i += 1) {
1767 float x_val = x_data[i];
1768 y_data[i] = x_val > 0 ? x_val : 0;
1769 }
1770 #endif
1771}
1772
1783void nn_silu1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x) {
1784 nn_assert(x->shape[0] == y->shape[0], "Cannot perform SiLU on tensors of different shapes");
1785
1786 const size_t n = y->shape[0];
1787 float *x_data = x->data;
1788 float *y_data = y->data;
1789
1790 for (size_t i = 0; i < n; i++) {
1791 float x_i = x_data[i];
1792 float sigmoid_x = 1.0f / (1.0f + expf(-x_i));
1793 y_data[i] = x_i * sigmoid_x;
1794 }
1795}
1796
1807void nn_softmax1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x) {
1808 nn_assert(y->shape[0] == x->shape[0], "Cannot perform softmax on tensors of different shapes");
1809
1810 const size_t n = y->shape[0];
1811 float *x_data = x->data;
1812 float *y_data = y->data;
1813
1814 float sum = 0.0f;
1815 for (size_t i = 0; i < n; i += 1) {
1816 y_data[i] = expf(x_data[i]);
1817 sum += y_data[i];
1818 }
1819 // normalize
1820 for (size_t i = 0; i < n; i += 1) {
1821 y_data[i] /= sum;
1822 }
1823}
1824
1836void nn_softmax2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, size_t dim) {
1837 nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot perform softmax on tensors of different shapes");
1838
1839 float *y_data = y->data;
1840 float *x_data = x->data;
1841
1842 if (dim == 0) {
1843 for (size_t i = 0; i < y->shape[1]; i += 1) {
1844 size_t n = y->shape[0];
1845 size_t m = y->shape[1];
1846 float sum = 0.0f;
1847 for (size_t j = 0; j < n; j += 1) {
1848 sum += expf(x_data[j * m]);
1849 }
1850
1851 for (size_t j = 0; j < n; j += 1) {
1852 y_data[j * m] = expf(x_data[j * m]) / sum;
1853 }
1854
1855 x_data += 1;
1856 y_data += 1;
1857 }
1858 }
1859 else if (dim == 1) {
1860 // HACK: fix batch size
1861 for (size_t i = 0; i < y->shape[0]; i += 1) {
1862 size_t n = y->shape[1];
1863 float sum = 0.0f;
1864 for (size_t j = 0; j < n; j += 1) {
1865 sum += expf(x_data[j]);
1866 }
1867
1868 for (size_t j = 0; j < n; j += 1) {
1869 y_data[j] = expf(x_data[j]) / sum;
1870 }
1871
1872 x_data += n;
1873 y_data += n;
1874 }
1875 }
1876 else {
1877 nn_assert(0, "Invalid dimension for softmax");
1878 }
1879}
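// Illustrative sketch: with dim == 1 every row of the 2D tensor is normalized
// independently, so each row of the output sums to 1; dim == 0 normalizes each
// column instead. The function name and values are made up for this example.
static void example_softmax(void) {
  float logits_data[6] = {1.0f, 2.0f, 3.0f,
                          0.0f, 0.0f, 0.0f};
  Tensor2D_F32 *logits = nn_as_tensor2d_f32((size_t[]){2, 3}, logits_data);
  Tensor2D_F32 *probs  = nn_zeros2d_f32((size_t[]){2, 3});
  nn_softmax2d_f32(probs, logits, 1);  // second row becomes {1/3, 1/3, 1/3}
  free(logits);
  free(probs->data);
  free(probs);
}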
1880
1888void nn_tanh2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x) {
1889 nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform tanh on tensors of different shapes");
1890
1891 const size_t n = y->shape[0] * y->shape[1];
1892 float *x_data = x->data;
1893 float *y_data = y->data;
1894
1895 for (size_t i = 0; i < n; i += 1) {
1896 float x_val = x_data[i];
1897 y_data[i] = tanhf(x_val);
1898 }
1899}
1900
1901void nn_rms_norm1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, const Tensor1D_F32 *weight, float eps) {
1902 nn_assert(x->shape[0] == y->shape[0], "Cannot perform RMSNorm on tensors of different shapes");
1903
1904 const size_t n = y->shape[0];
1905 float *x_data = x->data;
1906 float *y_data = y->data;
1907 float *w_data = weight->data;
1908
1909 float ss = 0.0f;
1910 for (size_t i = 0; i < n; i += 1) {
1911 ss += x_data[i] * x_data[i];
1912 }
1913 ss /= n;
1914 ss += eps;
1915
1916 // normalize and scale
1917 // y = (x / ss) * w
1918 nn_mulscalar1d_f32(y, x, 1.0f / sqrtf(ss));
1919 nn_mul1d_f32(y, y, weight);
1920}
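// Illustrative sketch: RMSNorm scales x by 1 / sqrt(mean(x^2) + eps) and then
// multiplies element-wise by the weight vector, as the two calls above implement.
// The function name and values are made up for this example.
static void example_rms_norm(void) {
  float x_data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor1D_F32 *x = nn_as_tensor1d_f32((size_t[]){4}, x_data);
  Tensor1D_F32 *w = nn_ones1d_f32((size_t[]){4});   // identity scaling
  Tensor1D_F32 *y = nn_zeros1d_f32((size_t[]){4});
  nn_rms_norm1d_f32(y, x, w, 1e-5f);
  free(x);
  free(w->data);
  free(w);
  free(y->data);
  free(y);
}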
1921
1922
1923/* ======================================================================================================== */
1924/* Attention */
1925/* ======================================================================================================== */
1926
1947void nn_scaled_dot_product_attention_f32(Tensor4D_F32 *y, const Tensor4D_F32 *query, const Tensor4D_F32 *key, const Tensor4D_F32 *value) {
1948 nn_assert(query->shape[0] == key->shape[0] && query->shape[0] == value->shape[0], "Query, key, and value must have the same batch size");
1949 nn_assert(query->shape[1] == key->shape[1] && query->shape[1] == value->shape[1], "Query, key, and value must have the same number of heads");
1950 nn_assert(key->shape[2] == value->shape[2], "Key and value must have the same sequence length");
1951 nn_assert(query->shape[3] == key->shape[3], "Query and key must have the same embedding dimension");
1952
1953 size_t n = query->shape[0]; // batch size
1954 size_t h = query->shape[1]; // head count
1955 size_t l = query->shape[2]; // target sequence length (query)
1956 size_t s = key->shape[2]; // source sequence length (key/value)
1957 size_t e = query->shape[3]; // embedding dimension
1958 size_t ev = value->shape[3]; // value embedding dimension
1959
1960 // scale_factor = 1 / math.sqrt(query.size(-1))
1961 float scale_factor = 1.0f / sqrt(e);
1962
1963 // Process each batch
1964 for (size_t batch = 0; batch < n; batch += 1) {
1965 // Process each head
1966 for (size_t head = 0; head < h; head += 1) {
1967 // Set up tensor views for the current batch and head
1968 size_t query_head_dims[2] = {l, e};
1969 size_t key_head_dims[2] = {s, e}; // Corrected: should be s, not l
1970 size_t key_transposed_dims[2] = {e, s}; // Transposed key dimensions
1971 size_t attn_weight_head_dims[2] = {l, s};
1972 size_t value_head_dims[2] = {s, ev};
1973 size_t y_head_dims[2] = {l, ev};
1974
1975 // Get the data pointers for the current batch and head
1976 float *query_data = (float *)query->data + (batch * h * l * e) + (head * l * e);
1977 float *key_data = (float *)key->data + (batch * h * s * e) + (head * s * e);
1978 float *value_data = (float *)value->data + (batch * h * s * ev) + (head * s * ev);
1979 float *y_data = (float *)y->data + (batch * h * l * ev) + (head * l * ev);
1980
1981 // Create tensor views
1982 Tensor2D_F32 *query_head = nn_as_tensor2d_f32(query_head_dims, query_data);
1983 Tensor2D_F32 *key_head = nn_as_tensor2d_f32(key_head_dims, key_data);
1984 Tensor2D_F32 *value_head = nn_as_tensor2d_f32(value_head_dims, value_data);
1985 Tensor2D_F32 *y_head = nn_as_tensor2d_f32(y_head_dims, y_data);
1986
1987 // Create and transpose the key matrix manually (key.transpose(-2, -1))
1988 Tensor2D_F32 *key_transposed = nn_tensor2d_f32(key_transposed_dims, NULL);
1989 for (size_t i = 0; i < s; i += 1) {
1990 for (size_t j = 0; j < e; j += 1) {
1991 key_transposed->data[j * s + i] = key_head->data[i * e + j];
1992 }
1993 }
1994
1995 // Calculate attention weights: attn_weight = query @ key.transpose(-2, -1)
1996 Tensor2D_F32 *attn_weight_head = nn_tensor2d_f32(attn_weight_head_dims, NULL);
1997 nn_mm_f32(attn_weight_head, query_head, key_transposed);
1998
1999 // Apply scaling: attn_weight = attn_weight * scale_factor
2000 nn_mulscalar2d_f32(attn_weight_head, attn_weight_head, scale_factor);
2001
2002 // attn_weight = torch.softmax(attn_weight, dim=-1)
2003 nn_softmax2d_f32(attn_weight_head, attn_weight_head, 1);
2004
2005 // (n, h, l, ev) = (n, h, l, s) @ (n, h, s, ev)
2006 // output = attn_weight @ value
2007 nn_mm_f32(y_head, attn_weight_head, value_head);
2008
2009 // Free the temporary tensors we created
2010 free(query_head);
2011 free(key_head);
2012 free(key_transposed->data);
2013 free(key_transposed);
2014 free(attn_weight_head->data);
2015 free(attn_weight_head);
2016 free(value_head);
2017 free(y_head);
2018 }
2019 }
2020}
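// Illustrative sketch: scaled dot-product attention for a batch of 1, 2 heads,
// query length 4, key/value length 6, and head dimension 8. The output has the
// query length and the value embedding dimension. The function name and shapes
// are made up for this example.
static void example_attention(void) {
  Tensor4D_F32 *q = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});  // (N, H, L, E)
  Tensor4D_F32 *k = nn_zeros4d_f32((size_t[]){1, 2, 6, 8});  // (N, H, S, E)
  Tensor4D_F32 *v = nn_zeros4d_f32((size_t[]){1, 2, 6, 8});  // (N, H, S, Ev)
  Tensor4D_F32 *y = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});  // (N, H, L, Ev)
  nn_scaled_dot_product_attention_f32(y, q, k, v);
  // ... use y->data, then free the tensors created above
}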
2021
2022
2023#endif // __NN_F32_H
void nn_print_f32(float v, int16_t num_digits)
Definition: nn.h:96
static void nn_assert(int condition, char *message)
Definition: nn.h:59
void nn_linear_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, const Tensor2D_F32 *weight, const Tensor1D_F32 *bias)
Linear neural network layer.
Definition: nn_f32.h:1633
Tensor3D_F32 * nn_zeros3d_f32(size_t shape[3])
Returns a 3D floating-point data tensor filled with the scalar value 0, with the shape defined by the...
Definition: nn_f32.h:314
void nn_softmax1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x)
Applies the softmax activation function to a 1D floating-point tensor.
Definition: nn_f32.h:1807
Tensor1D_F32 * nn_zeros1d_f32(size_t shape[1])
Returns a 1D floating-point tensor filled with the scalar value 0, with the shape defined by the 1-el...
Definition: nn_f32.h:282
void nn_print_tensor4d_f32(const Tensor4D_F32 *tensor)
Prints the content of a 4D floating-point data tensor.
Definition: nn_f32.h:568
uint8_t nn_equals0d_f32(const Tensor0D_F32 *a, const Tensor0D_F32 *b, float rel_err)
Checks if two 0D floating-point tensors are equal.
Definition: nn_f32.h:629
Tensor0D_F32 * nn_full0d_f32(float data)
Returns a 0D floating-point data tensor (scalar) filled with the scalar value data.
Definition: nn_f32.h:388
void nn_mm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Performs a matrix multiplication of the matrices x1 and x2.
Definition: nn_f32.h:1497
Tensor2D_F32 * nn_rand2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with random floating-point numbers,...
Definition: nn_f32.h:460
void nn_mulscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)
Multiplies a scalar with a 2D floating-point tensor and stores the result in y.
Definition: nn_f32.h:1133
void nn_mv_f32(Tensor1D_F32 *y, const Tensor2D_F32 *x1, const Tensor1D_F32 *x2)
Performs a matrix-vector multiplication of the matrix x1 and the vector x2.
Definition: nn_f32.h:1466
void nn_rms_norm1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, const Tensor1D_F32 *weight, float eps)
Definition: nn_f32.h:1901
void nn_addscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)
Adds a scalar to a 2D floating-point tensor and stores the result in y.
Definition: nn_f32.h:981
Tensor2D_F32 * nn_full2d_f32(size_t shape[2], float data)
Returns a 2D floating-point data tensor filled with the scalar value data, with the shape defined by ...
Definition: nn_f32.h:418
uint8_t nn_equals3d_f32(const Tensor3D_F32 *a, const Tensor3D_F32 *b, float rel_err)
Checks if two 3D floating-point tensors are equal.
Definition: nn_f32.h:687
void nn_max2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)
Finds the maximum value in a 2D floating-point tensor.
Definition: nn_f32.h:766
Tensor3D_F32 * nn_tensor3d_f32(size_t shape[3], const float *data)
Creates a 3D floating-point data tensor, with the shape defined by the 3-element array shape.
Definition: nn_f32.h:159
void nn_relu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)
Applies the ReLU activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1746
Tensor2D_F32 * nn_tensor2d_f32(size_t shape[2], const float *data)
Creates a 2D floating-point data tensor, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:136
void nn_add2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:907
void nn_scaled_dot_product_attention_f32(Tensor4D_F32 *y, const Tensor4D_F32 *query, const Tensor4D_F32 *key, const Tensor4D_F32 *value)
Computes scaled dot product attention on query, key and value tensors.
Definition: nn_f32.h:1947
void nn_dot_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Computes the dot product of two 1D floating-point tensors.
Definition: nn_f32.h:1425
void nn_addmm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *c, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Performs a matrix multiplication of two 2D floating-point tensors and adds the result to a third tens...
Definition: nn_f32.h:1563
void nn_conv2d_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in, const Tensor4D_F32 *weight, const Tensor1D_F32 *bias, const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups)
Performs a 2D convolution on a 4D tensor.
Definition: nn_f32.h:1236
Tensor1D_F32 * nn_ones1d_f32(size_t shape[1])
Returns a 1D floating-point data tensor filled with the scalar value 1, with the shape defined by the...
Definition: nn_f32.h:356
void nn_min2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)
Finds the minimum value in a 2D floating-point tensor.
Definition: nn_f32.h:830
Tensor0D_F32 * nn_ones0d_f32()
Returns a 0D floating-point data tensor (scalar) filled with the scalar value 1.
Definition: nn_f32.h:344
static uint8_t nn_equal_f32(float golden, float actual, float rel_err)
Checks if two floating-point numbers are equal within a relative error.
Definition: nn_f32.h:80
Tensor4D_F32 * nn_tensor4d_f32(size_t shape[4], const float *data)
Creates a 4D floating-point data tensor, with the shape defined by the 4-element array shape.
Definition: nn_f32.h:183
void nn_elu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float alpha)
Applies the ELU activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1719
void nn_print_tensor3d_f32(const Tensor3D_F32 *tensor)
Prints the content of a 3D floating-point data tensor.
Definition: nn_f32.h:529
void nn_addscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)
Adds a scalar to a 1D floating-point tensor and stores the result in y.
Definition: nn_f32.h:946
void nn_tanh2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)
Applies the tanh activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1888
Tensor2D_F32 * nn_as_tensor2d_f32(size_t shape[2], float *data)
Converts data into a tensor, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:221
Tensor1D_F32 * nn_rand1d_f32(size_t shape[1])
Returns a 1D floating-point data tensor filled with random floating-point numbers,...
Definition: nn_f32.h:444
void nn_mul1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:1020
void nn_softmax2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, size_t dim)
Applies the softmax activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1836
uint8_t nn_equals4d_f32(const Tensor4D_F32 *a, const Tensor4D_F32 *b, float rel_err)
Checks if two 4D floating-point tensors are equal.
Definition: nn_f32.h:709
void nn_mulscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)
Multiplies a scalar with a 1D floating-point tensor and stores the result in y.
Definition: nn_f32.h:1098
void nn_add1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:868
Tensor2D_F32 * nn_zeros2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with the scalar value 0, with the shape defined by the...
Definition: nn_f32.h:298
uint8_t nn_equals1d_f32(const Tensor1D_F32 *a, const Tensor1D_F32 *b, float rel_err)
Checks if two 1D floating-point tensors are equal.
Definition: nn_f32.h:643
void nn_nchw_to_nhwc_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in)
Converts a 4D tensor from NCHW format to NHWC format.
Definition: nn_f32.h:1170
void nn_print_tensor2d_f32(const Tensor2D_F32 *tensor)
Prints the content of a 2D floating-point data tensor.
Definition: nn_f32.h:499
void nn_silu1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x)
Applies the SiLU (Sigmoid Linear Unit) activation function to a 1D floating-point tensor.
Definition: nn_f32.h:1783
void nn_min1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)
Finds the minimum value in a 1D floating-point tensor.
Definition: nn_f32.h:798
Tensor0D_F32 * nn_rand0d_f32()
Returns a 0D floating-point data tensor (scalar) filled with a random floating-point number.
Definition: nn_f32.h:432
Tensor1D_F32 * nn_as_tensor1d_f32(size_t shape[1], float *data)
Converts data into a tensor, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:206
void nn_mul2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:1059
Tensor3D_F32 * nn_as_tensor3d_f32(size_t shape[3], float *data)
Converts data into a tensor, with the shape defined by the 3-element array shape.
Definition: nn_f32.h:237
Tensor1D_F32 * nn_full1d_f32(size_t shape[1], float data)
Returns a 1D floating-point data tensor filled with the scalar value data, with the shape defined by ...
Definition: nn_f32.h:401
Tensor0D_F32 * nn_zeros0d_f32()
Returns a 0D floating-point tensor (scalar) filled with the scalar value 0.
Definition: nn_f32.h:270
uint8_t nn_equals2d_f32(const Tensor2D_F32 *a, const Tensor2D_F32 *b, float rel_err)
Checks if two 2D floating-point tensors are equal.
Definition: nn_f32.h:665
Tensor0D_F32 * nn_tensor0d_f32(float data)
Creates a 0D floating-point data tensor.
Definition: nn_f32.h:98
Tensor4D_F32 * nn_as_tensor4d_f32(size_t shape[4], float *data)
Converts data into a tensor, with the shape defined by the 4-element array shape.
Definition: nn_f32.h:254
void nn_print_tensor1d_f32(const Tensor1D_F32 *tensor)
Prints the content of a 1D floating-point data tensor.
Definition: nn_f32.h:481
Tensor2D_F32 * nn_ones2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with the scalar value 1, with the shape defined by the...
Definition: nn_f32.h:372
void nn_max1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)
Finds the maximum value in a 1D floating-point tensor.
Definition: nn_f32.h:734
Tensor1D_F32 * nn_tensor1d_f32(size_t shape[1], const float *data)
Creates a 1D floating-point data tensor, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:114
void nn_nhwc_to_nchw_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in)
Converts a 4D tensor from NHWC format to NCHW format.
Definition: nn_f32.h:1203
Tensor4D_F32 * nn_zeros4d_f32(size_t shape[4])
Returns a 4D floating-point data tensor filled with the scalar value 0, with the shape defined by the...
Definition: nn_f32.h:330
float data
Definition: nn_f32.h:24
A 0D tensor (scalar) with a float data type.
Definition: nn_f32.h:23
size_t shape[1]
Definition: nn_f32.h:34
float * data
Definition: nn_f32.h:35
A 1D tensor with a float data type.
Definition: nn_f32.h:33
float * data
Definition: nn_f32.h:46
size_t shape[2]
Definition: nn_f32.h:45
A 2D tensor with a float data type.
Definition: nn_f32.h:44
size_t shape[3]
Definition: nn_f32.h:55
float * data
Definition: nn_f32.h:56
A 3D tensor with a float data type.
Definition: nn_f32.h:54
size_t shape[4]
Definition: nn_f32.h:65
float * data
Definition: nn_f32.h:66
A 4D tensor with a float data type.
Definition: nn_f32.h:64