Baremetal-NN API documentation
nn_f16.h
/**
 * @file nn_f16.h
 * @brief Half-precision (fp16) tensor types and operations.
 */
#ifndef __NN_F16_H
#define __NN_F16_H

#include <math.h>    // fabsf, expf, tanhf
#include <stddef.h>  // size_t
#include <stdint.h>  // uint8_t, int16_t
#include <stdio.h>   // printf
#include <stdlib.h>  // malloc, rand
#include <string.h>  // memcpy

#include "float16.h"

#ifdef CONFIG_BACKEND_RISCV_V
  #include "riscv_vector.h"
#endif

/**
 * A 0D tensor (scalar) with a half-precision floating-point data type.
 */
typedef struct {
  float16_t data;
} Tensor0D_F16;

/**
 * A 1D tensor with a half-precision floating-point data type.
 */
typedef struct {
  size_t shape[1];
  float16_t *data;
} Tensor1D_F16;

/**
 * A 2D tensor with a half-precision floating-point data type.
 */
typedef struct {
  size_t shape[2];
  float16_t *data;
} Tensor2D_F16;

/**
 * Checks if two half-precision floating-point numbers are equal within a
 * relative error: the test passes when either the absolute or the relative
 * difference is below rel_err.
 */
static inline uint8_t nn_equal_f16(float16_t golden, float16_t actual, float rel_err) {
  return (fabsf(as_f32(actual) - as_f32(golden)) < rel_err) || (fabsf((as_f32(actual) - as_f32(golden)) / as_f32(actual)) < rel_err);
}

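/*
 * Illustrative sketch (not part of the original header): how the tolerance
 * check behaves. The literal values below are made-up examples.
 *
 *   nn_equal_f16(as_f16(2.0f), as_f16(2.001f), 1e-2f);  // 1: |diff| < 0.01
 *   nn_equal_f16(as_f16(2.0f), as_f16(2.5f), 1e-2f);    // 0: both tests fail
 */
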
/* ======================================================================================================== */
/* Tensor Creation */
/* ======================================================================================================== */

/**
 * Creates a 0D tensor with type F16.
 */
Tensor0D_F16 *nn_tensor0d_f16(float16_t data) {
  Tensor0D_F16 *tensor = (Tensor0D_F16 *)malloc(sizeof(Tensor0D_F16));
  tensor->data = data;
  return tensor;
}

/**
 * Creates a 1D tensor with type F16.
 */
Tensor1D_F16 *nn_tensor1d_f16(size_t shape[1], const float16_t *data) {
  Tensor1D_F16 *tensor = (Tensor1D_F16 *)malloc(sizeof(Tensor1D_F16));
  tensor->shape[0] = shape[0];

  size_t n_bytes = shape[0] * sizeof(float16_t);
  tensor->data = (float16_t *)malloc(n_bytes);
  if (data != NULL) {
    memcpy(tensor->data, data, n_bytes);
  }
  return tensor;
}

/**
 * Creates a 2D tensor with type F16.
 */
Tensor2D_F16 *nn_tensor2d_f16(size_t shape[2], const float16_t *data) {
  Tensor2D_F16 *tensor = (Tensor2D_F16 *)malloc(sizeof(Tensor2D_F16));
  tensor->shape[0] = shape[0];
  tensor->shape[1] = shape[1];

  size_t n_bytes = shape[0] * shape[1] * sizeof(float16_t);
  tensor->data = (float16_t *)malloc(n_bytes);
  if (data != NULL) {
    memcpy(tensor->data, data, n_bytes);
  }
  return tensor;
}

/**
 * Creates a 0D tensor with type F16 and initializes it to 0.
 */
Tensor0D_F16 *nn_zeros0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(0);
  return tensor;
}

/**
 * Creates a 1D tensor with type F16 and initializes it to 0.
 */
Tensor1D_F16 *nn_zeros1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 0;
  }
  return tensor;
}

/**
 * Creates a 2D tensor with type F16 and initializes it to 0.
 */
Tensor2D_F16 *nn_zeros2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 0;
  }
  return tensor;
}

/**
 * Creates a 0D tensor with type F16 and initializes it to 1.
 */
Tensor0D_F16 *nn_ones0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(1);
  return tensor;
}

/**
 * Creates a 1D tensor with type F16 and initializes it to 1.
 */
Tensor1D_F16 *nn_ones1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 1;
  }
  return tensor;
}

/**
 * Creates a 2D tensor with type F16 and initializes it to 1.
 */
Tensor2D_F16 *nn_ones2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 1;
  }
  return tensor;
}

/**
 * Creates a 0D tensor with type F16 and initializes it to a specific value.
 */
Tensor0D_F16 *nn_full0d_f16(float16_t data) {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(data);
  return tensor;
}

/**
 * Creates a 1D tensor with type F16 and initializes it to a specific value.
 */
Tensor1D_F16 *nn_full1d_f16(size_t shape[1], float16_t data) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = data;
  }
  return tensor;
}

/**
 * Creates a 2D tensor with type F16 and initializes it to a specific value.
 */
Tensor2D_F16 *nn_full2d_f16(size_t shape[2], float16_t data) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = data;
  }
  return tensor;
}

/**
 * Creates a 0D tensor with type F16 and initializes it to a random value.
 */
Tensor0D_F16 *nn_rand0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(as_f16(rand()));
  return tensor;
}

/**
 * Creates a 1D tensor with type F16 and initializes it to a random value.
 */
Tensor1D_F16 *nn_rand1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = as_f16(rand());
  }
  return tensor;
}

/**
 * Creates a 2D tensor with type F16 and initializes it to a random value.
 */
Tensor2D_F16 *nn_rand2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = as_f16(rand());
  }
  return tensor;
}

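/*
 * Usage sketch (illustrative, not part of the original header): allocating
 * and initializing F16 tensors. Shapes and values are made-up examples.
 *
 *   size_t shape[2] = {2, 3};
 *   Tensor2D_F16 *a = nn_ones2d_f16(shape);                // every element 1.0
 *   Tensor2D_F16 *b = nn_full2d_f16(shape, as_f16(0.5f));  // every element 0.5
 *   Tensor2D_F16 *c = nn_rand2d_f16(shape);                // rand() converted to f16
 *
 * Each creation routine malloc()s the struct (and, for 1D/2D, the data
 * buffer); on bare-metal targets the caller owns these allocations.
 */
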
/* ======================================================================================================== */
/* Tensor Prints */
/* ======================================================================================================== */

/**
 * Prints a half-precision floating-point number.
 */
void nn_print_f16(float16_t v, int16_t num_digits) {
  nn_print_f32(as_f32(v), num_digits);
}

/**
 * Prints the content of a 1D tensor with type F16.
 */
void nn_print_tensor1d_f16(const Tensor1D_F16 *tensor) {
  printf("[");
  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
    nn_print_f16(*((float16_t *)tensor->data + i), 3);
    if (i < tensor->shape[0] - 1) {
      printf(" ");
    }
  }
  printf("]\n");
}

/**
 * Prints the content of a 2D tensor with type F16.
 */
void nn_print_tensor2d_f16(const Tensor2D_F16 *tensor) {
  printf("[");
  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
    if (i != 0) {
      printf(" ");
    }
    printf("[");
    for (size_t j = 0; j < tensor->shape[1]; j += 1) {
      nn_print_f16(*((float16_t *)tensor->data + i*tensor->shape[1] + j), 3);
      if (j < tensor->shape[1] - 1) {
        printf(" ");
      }
    }
    printf("]");
    if (i < tensor->shape[0] - 1) {
      printf("\n");
    }
  }
  printf("]\n");
}

/* ======================================================================================================== */
/* Comparison */
/* ======================================================================================================== */

/**
 * Checks if two 0D tensors with type F16 are equal.
 */
uint8_t nn_equals0d_f16(const Tensor0D_F16 *a, const Tensor0D_F16 *b, float rel_err) {
  return nn_equal_f16(a->data, b->data, rel_err);
}

/**
 * Checks if two 1D tensors with type F16 are equal.
 */
uint8_t nn_equals1d_f16(const Tensor1D_F16 *a, const Tensor1D_F16 *b, float rel_err) {
  nn_assert(a->shape[0] == b->shape[0], "Cannot compare tensors of different shapes");

  size_t n = a->shape[0];
  for (size_t i = 0; i < n; i += 1) {
    if (!nn_equal_f16(a->data[i], b->data[i], rel_err)) {
      return 0;
    }
  }
  return 1;
}

/**
 * Checks if two 2D tensors with type F16 are equal.
 */
uint8_t nn_equals2d_f16(const Tensor2D_F16 *a, const Tensor2D_F16 *b, float rel_err) {
  nn_assert(a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1], "Cannot compare tensors of different shapes");

  size_t n = a->shape[0] * a->shape[1];
  for (size_t i = 0; i < n; i += 1) {
    if (!nn_equal_f16(a->data[i], b->data[i], rel_err)) {
      return 0;
    }
  }
  return 1;
}

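/*
 * Usage sketch (illustrative): round-tripping a tensor through a copy and
 * checking equality within a 1% relative tolerance. Names are made up.
 *
 *   size_t shape[1] = {4};
 *   Tensor1D_F16 *x = nn_rand1d_f16(shape);
 *   Tensor1D_F16 *x_copy = nn_tensor1d_f16(shape, x->data);
 *   nn_equals1d_f16(x, x_copy, 1e-2f);   // -> 1
 */
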
/* ======================================================================================================== */
/* Unary */
/* ======================================================================================================== */

/**
 * Finds the maximum value in a 1D tensor with type F16.
 */
void nn_max1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x) {
  size_t n = x->shape[0];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
  #else // scalar implementation
    y->data = -FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x_data[i]);
      y->data = val > y->data ? val : y->data;
    }
  #endif
}

/**
 * Finds the maximum value in a 2D tensor with type F16.
 */
void nn_max2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x) {
  size_t n = x->shape[0] * x->shape[1];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
  #else // scalar implementation
    y->data = -FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x_data[i]);
      y->data = val > y->data ? val : y->data;
    }
  #endif
}

/**
 * Finds the minimum value in a 1D tensor with type F16.
 */
void nn_min1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x) {
  size_t n = x->shape[0];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
  #else // scalar implementation
    y->data = FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x_data[i]);
      y->data = val < y->data ? val : y->data;
    }
  #endif
}

/**
 * Finds the minimum value in a 2D tensor with type F16.
 */
void nn_min2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x) {
  size_t n = x->shape[0] * x->shape[1];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
  #else // scalar implementation
    y->data = FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x_data[i]);
      y->data = val < y->data ? val : y->data;
    }
  #endif
}

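/*
 * Usage sketch (illustrative): reducing a 2D tensor to its extremes. The
 * result of each reduction is a 0D tensor, so it lives in y->data.
 *
 *   size_t shape[2] = {4, 4};
 *   Tensor2D_F16 *x = nn_rand2d_f16(shape);
 *   Tensor0D_F16 *hi = nn_zeros0d_f16();
 *   Tensor0D_F16 *lo = nn_zeros0d_f16();
 *   nn_max2d_f16(hi, x);
 *   nn_min2d_f16(lo, x);
 *   printf("range: [%f, %f]\n", as_f32(lo->data), as_f32(hi->data));
 */
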
/* ======================================================================================================== */
/* Addition */
/* ======================================================================================================== */

/**
 * Adds x1 and x2 element-wise and stores the result in y.
 */
void nn_add1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot add tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot add tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x1_data[i]) + as_f32(x2_data[i]));
    }
  #endif
}

/**
 * Adds x1 and x2 element-wise and stores the result in y.
 */
void nn_add2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot add tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot add tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x1_data[i]) + as_f32(x2_data[i]));
    }
  #endif
}

/**
 * Adds a scalar to a 1D tensor with type F16.
 */
void nn_addscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot add tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x_data[i]) + as_f32(scalar));
    }
  #endif
}

/**
 * Adds a scalar to a 2D tensor with type F16.
 */
void nn_addscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot add tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x_data[i]) + as_f32(scalar));
    }
  #endif
}


/* ======================================================================================================== */
/* Multiplication */
/* ======================================================================================================== */

/**
 * Multiplies x1 and x2 element-wise and stores the result in y.
 */
void nn_mul1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot multiply tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x1_data[i]) * as_f32(x2_data[i]));
    }
  #endif
}

/**
 * Multiplies x1 and x2 element-wise and stores the result in y.
 */
void nn_mul2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot multiply tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x1_data[i]) * as_f32(x2_data[i]));
    }
  #endif
}

/**
 * Multiplies a scalar with a 1D tensor with type F16.
 */
void nn_mulscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x_data[i]) * as_f32(scalar));
    }
  #endif
}

/**
 * Multiplies a scalar with a 2D tensor with type F16.
 */
void nn_mulscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y_data[i] = as_f16(as_f32(x_data[i]) * as_f32(scalar));
    }
  #endif
}

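/*
 * Usage sketch (illustrative): composing element-wise ops to average two
 * tensors, y = (a + b) * 0.5. In-place use works here because every output
 * element depends only on the input element at the same index.
 *
 *   size_t shape[2] = {2, 3};
 *   Tensor2D_F16 *a = nn_rand2d_f16(shape);
 *   Tensor2D_F16 *b = nn_rand2d_f16(shape);
 *   Tensor2D_F16 *y = nn_zeros2d_f16(shape);
 *   nn_add2d_f16(y, a, b);
 *   nn_mulscalar2d_f16(y, y, as_f16(0.5f));
 */
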

/* ======================================================================================================== */
/* MatMul */
/* ======================================================================================================== */

/**
 * Performs a dot product of two 1D tensors with type F16.
 */
void nn_dot_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot dot tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot dot tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  float sum_f32 = 0;
  for (size_t i = 0; i < n; i += 1) {
    sum_f32 += as_f32(x1_data[i]) * as_f32(x2_data[i]);
  }
  y_data[0] = as_f16(sum_f32);
}

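/*
 * Usage sketch (illustrative): a 3-element dot product. Note that only
 * y->data[0] is written, though the assert requires y to match x1's length.
 *
 *   size_t shape[1] = {3};
 *   Tensor1D_F16 *a = nn_ones1d_f16(shape);
 *   Tensor1D_F16 *b = nn_full1d_f16(shape, as_f16(2.0f));
 *   Tensor1D_F16 *out = nn_zeros1d_f16(shape);
 *   nn_dot_f16(out, a, b);   // out->data[0] == 6.0
 */
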
/**
 * Performs a matrix multiplication of the matrices x1 and x2.
 */
void nn_mm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform MatMul on tensors of different shapes");

  const size_t n = x1->shape[0];
  const size_t m = x1->shape[1];
  const size_t p = x2->shape[1];

  for (size_t i = 0; i < n; i += 1) {
    float16_t *x1_row = x1->data + i * m;
    float16_t *y_row = y->data + i * p;

    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      size_t vlmax = __riscv_vsetvlmax_e16m1();
      for (size_t j = 0; j < p; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *x1_ptr = x1_row;  // fresh cursor so the row is re-read for every output column
        float16_t *x2_col = x2->data + j;
        size_t k = m;

        while (k > 0) {
          size_t vl = __riscv_vsetvl_e16m1(k);
          vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
          vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_col, p * sizeof(float16_t), vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

          x1_ptr += vl;
          x2_col += vl * p;
          k -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif
        y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
      }
    #else
      for (size_t j = 0; j < p; j += 1) {
        float16_t *x2_col = x2->data + j;

        float sum = 0.f;
        for (size_t k = 0; k < m; k += 1) {
          sum += as_f32(x1_row[k]) * as_f32(x2_col[k * p]);
        }
        y_row[j] = as_f16(sum);
      }
    #endif
  }
}

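/*
 * Usage sketch (illustrative): multiplying a 2x3 matrix of ones by a 3x2
 * matrix of twos gives a 2x2 matrix where every entry is 6.
 *
 *   size_t a_shape[2] = {2, 3};
 *   size_t b_shape[2] = {3, 2};
 *   size_t y_shape[2] = {2, 2};
 *   Tensor2D_F16 *a = nn_ones2d_f16(a_shape);
 *   Tensor2D_F16 *b = nn_full2d_f16(b_shape, as_f16(2.0f));
 *   Tensor2D_F16 *y = nn_zeros2d_f16(y_shape);
 *   nn_mm_f16(y, a, b);
 *   nn_print_tensor2d_f16(y);   // [[6 6]
 *                               //  [6 6]]
 */
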
/**
 * Performs a matrix multiplication of the matrices x1 and x2, and adds c to
 * the result.
 */
void nn_addmm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *c, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform Linear on tensors of different shapes");
  nn_assert(y->shape[0] == c->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform Linear on tensors of different shapes");

  const size_t n = x1->shape[0];
  const size_t m = x1->shape[1];
  const size_t p = x2->shape[1];

  for (size_t i = 0; i < n; i += 1) {
    float16_t *x1_row = x1->data + i * m;
    float16_t *c_row = c->data + i * p;
    float16_t *y_row = y->data + i * p;

    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      size_t vlmax = __riscv_vsetvlmax_e16m1();
      for (size_t j = 0; j < p; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *x1_ptr = x1_row;  // fresh cursor so the row is re-read for every output column
        float16_t *x2_col = x2->data + j;
        size_t k = m;

        while (k > 0) {
          size_t vl = __riscv_vsetvl_e16m1(k);
          vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
          vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_col, p * sizeof(float16_t), vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

          x1_ptr += vl;
          x2_col += vl * p;
          k -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif
        y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum) + c_row[j];
      }
    #else
      for (size_t j = 0; j < p; j += 1) {
        float16_t *x2_col = x2->data + j;

        float sum = 0.f;
        for (size_t k = 0; k < m; k += 1) {
          sum += as_f32(x1_row[k]) * as_f32(x2_col[k * p]);
        }
        y_row[j] = as_f16(sum + as_f32(c_row[j]));
      }
    #endif
  }
}

/**
 * Linear neural network layer.
 */
void nn_linear_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, const Tensor2D_F16 *weight, const Tensor1D_F16 *bias) {
  nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
  nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");

  const size_t batch_size = x->shape[0];
  const size_t in_features = x->shape[1];
  const size_t out_features = weight->shape[0];

  float16_t *x_batch_data = x->data;
  float16_t *y_batch_data = y->data;

  for (size_t i = 0; i < batch_size; i += 1) {
    float16_t *x_data = x_batch_data;
    float16_t *y_data = y_batch_data;

    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      size_t vlmax = __riscv_vsetvlmax_e16m1();

      for (size_t j = 0; j < out_features; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *weight_row = weight->data + j * in_features;
        size_t n = in_features;

        while (n > 0) {
          size_t vl = __riscv_vsetvl_e16m1(n);
          vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
          vfloat16m1_t vec_w = __riscv_vle16_v_f16m1(weight_row, vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x, vec_w, vl);

          x_data += vl;
          weight_row += vl;
          n -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif

        float16_t sum = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
        if (bias) {
          sum = as_f16(as_f32(sum) + as_f32(bias->data[j]));
        }
        y_data[j] = sum;
        x_data = x_batch_data; // reset x_data pointer for next output feature
      }
    #else // scalar implementation
      for (size_t j = 0; j < out_features; j += 1) {
        float16_t *weight_row = weight->data + j * in_features;

        float sum = 0.f;
        for (size_t k = 0; k < in_features; k += 1) {
          sum += as_f32(x_data[k]) * as_f32(weight_row[k]);
        }
        if (bias) {
          sum += as_f32(bias->data[j]);
        }
        y_data[j] = as_f16(sum);
      }
    #endif

    x_batch_data += in_features;
    y_batch_data += out_features;
  }
}

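/*
 * Usage sketch (illustrative): one linear layer on a batch of two inputs,
 * with x: (batch_size, in_features), weight: (out_features, in_features),
 * bias: (out_features) or NULL, y: (batch_size, out_features). With all-one
 * inputs and weights, each output is in_features * 1 + bias.
 *
 *   size_t x_shape[2] = {2, 4};
 *   size_t w_shape[2] = {3, 4};
 *   size_t b_shape[1] = {3};
 *   size_t y_shape[2] = {2, 3};
 *   Tensor2D_F16 *x = nn_ones2d_f16(x_shape);
 *   Tensor2D_F16 *w = nn_ones2d_f16(w_shape);
 *   Tensor1D_F16 *b = nn_full1d_f16(b_shape, as_f16(0.5f));
 *   Tensor2D_F16 *y = nn_zeros2d_f16(y_shape);
 *   nn_linear_f16(y, x, w, b);      // every element of y == 4.5
 *   nn_linear_f16(y, x, w, NULL);   // bias is optional; every element == 4.0
 */
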

/* ======================================================================================================== */
/* Non-linear */
/* ======================================================================================================== */

/**
 * Applies the ELU activation function to a 2D tensor with type F16.
 */
void nn_elu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float alpha) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ELU on tensors of different shapes");

  const size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  for (size_t i = 0; i < n; i += 1) {
    if (as_f32(x_data[i]) > 0) {
      y_data[i] = x_data[i];
    }
    else {
      y_data[i] = as_f16(alpha * (expf(as_f32(x_data[i])) - 1.f));
    }
  }
}

/**
 * Applies the ReLU activation function to a 2D tensor with type F16.
 */
void nn_relu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ReLU on tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    float16_t zero = 0.0f;

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmax_vf_f16m1(vec_x, zero, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      float x_val = as_f32(x_data[i]);
      y_data[i] = x_val > 0 ? as_f16(x_val) : 0;
    }
  #endif
}

/**
 * Applies the softmax activation function to a 1D tensor with type F16.
 */
void nn_softmax1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot perform softmax on tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  float sum = 0.0f;
  for (size_t i = 0; i < n; i += 1) {
    sum += expf(as_f32(x_data[i]));
  }

  for (size_t i = 0; i < n; i += 1) {
    y_data[i] = as_f16(expf(as_f32(x_data[i])) / sum);
  }
}

/**
 * Applies the tanh activation function to a 2D tensor with type F16.
 */
void nn_tanh2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform tanh on tensors of different shapes");

  const size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  for (size_t i = 0; i < n; i += 1) {
    float x_val = as_f32(x_data[i]);
    y_data[i] = as_f16(tanhf(x_val));
  }
}
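
/*
 * End-to-end sketch (illustrative): a tiny two-layer forward pass built from
 * the routines above — linear -> ReLU -> linear -> softmax over one row.
 * All shapes and the weight/bias names (w1, b1, w2, b2) are made-up examples;
 * the weights would be loaded elsewhere.
 *
 *   size_t x_shape[2] = {1, 8}, h_shape[2] = {1, 16}, o_shape[2] = {1, 4};
 *   Tensor2D_F16 *x = nn_rand2d_f16(x_shape);
 *   Tensor2D_F16 *h = nn_zeros2d_f16(h_shape);
 *   Tensor2D_F16 *o = nn_zeros2d_f16(o_shape);
 *   nn_linear_f16(h, x, w1, b1);   // w1: (16, 8), b1: (16)
 *   nn_relu2d_f16(h, h);           // in-place is safe for element-wise ops
 *   nn_linear_f16(o, h, w2, b2);   // w2: (4, 16), b2: (4)
 *
 *   Tensor1D_F16 logits = { .shape = {4}, .data = o->data };  // 1D view of row 0
 *   nn_softmax1d_f16(&logits, &logits);
 */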

#endif // __NN_F16_H