Baremetal-NN
Baremetal-NN API documentation
nn_f16.h
#ifndef __NN_F16_H
#define __NN_F16_H

#include "float16.h"


#ifdef CONFIG_BACKEND_RISCV_V
  #include "riscv_vector.h"
#endif

/** A 0D (scalar) tensor with a float16 data type. */
typedef struct {
  float16_t data;
} Tensor0D_F16;


/** A 1D tensor with a float16 data type. */
typedef struct {
  size_t shape[1];
  float16_t *data;
} Tensor1D_F16;


/** A 2D tensor with a float16 data type. */
typedef struct {
  size_t shape[2];
  float16_t *data;
} Tensor2D_F16;

static inline uint8_t nn_equal_f16(float16_t golden, float16_t actual, float rel_err) {
  return (fabs(as_f32(actual) - as_f32(golden)) < rel_err) || (fabs((as_f32(actual) - as_f32(golden)) / as_f32(actual)) < rel_err);
}
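
/**
 * Usage sketch (illustrative; not part of the original header): nn_equal_f16
 * accepts a value when either the absolute or the relative error is below
 * rel_err, so one tolerance works for both small and large magnitudes.
 * Assumes the umbrella header nn.h (which provides the nn_assert and
 * nn_print_f32 helpers used below) is on the include path.
 *
 * @code
 * #include <stdio.h>
 * #include "nn.h"
 *
 * int main(void) {
 *   float16_t golden = as_f16(100.0f);
 *   float16_t actual = as_f16(100.5f);
 *   // |100.5 - 100| = 0.5 fails the absolute check, but the relative error
 *   // 0.5 / 100.5 ~ 0.005 is below 1e-2, so the comparison still passes.
 *   printf("equal: %d\n", nn_equal_f16(golden, actual, 1e-2f));
 *   return 0;
 * }
 * @endcode
 */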


/* ======================================================================================================== */
/*                                            Tensor Creation                                               */
/* ======================================================================================================== */

Tensor0D_F16 *nn_tensor0d_f16(float16_t data) {
  Tensor0D_F16 *tensor = (Tensor0D_F16 *)malloc(sizeof(Tensor0D_F16));
  tensor->data = data;
  return tensor;
}

Tensor1D_F16 *nn_tensor1d_f16(size_t shape[1], const float16_t *data) {
  Tensor1D_F16 *tensor = (Tensor1D_F16 *)malloc(sizeof(Tensor1D_F16));
  tensor->shape[0] = shape[0];

  size_t n_bytes = shape[0] * sizeof(float16_t);
  tensor->data = (float16_t *)malloc(n_bytes);
  if (data != NULL) {
    memcpy(tensor->data, data, n_bytes);
  }
  return tensor;
}

Tensor2D_F16 *nn_tensor2d_f16(size_t shape[2], const float16_t *data) {
  Tensor2D_F16 *tensor = (Tensor2D_F16 *)malloc(sizeof(Tensor2D_F16));
  tensor->shape[0] = shape[0];
  tensor->shape[1] = shape[1];

  size_t n_bytes = shape[0] * shape[1] * sizeof(float16_t);
  tensor->data = (float16_t *)malloc(n_bytes);
  if (data != NULL) {
    memcpy(tensor->data, data, n_bytes);
  }
  return tensor;
}

Tensor0D_F16 *nn_zeros0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(0);
  return tensor;
}

Tensor1D_F16 *nn_zeros1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 0;
  }
  return tensor;
}

Tensor2D_F16 *nn_zeros2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 0;
  }
  return tensor;
}

Tensor0D_F16 *nn_ones0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(1);
  return tensor;
}

Tensor1D_F16 *nn_ones1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 1;
  }
  return tensor;
}

Tensor2D_F16 *nn_ones2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = 1;
  }
  return tensor;
}

Tensor0D_F16 *nn_full0d_f16(float16_t data) {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(data);
  return tensor;
}

Tensor1D_F16 *nn_full1d_f16(size_t shape[1], float16_t data) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = data;
  }
  return tensor;
}

Tensor2D_F16 *nn_full2d_f16(size_t shape[2], float16_t data) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = data;
  }
  return tensor;
}

Tensor0D_F16 *nn_rand0d_f16() {
  Tensor0D_F16 *tensor = nn_tensor0d_f16(as_f16(rand()));
  return tensor;
}

Tensor1D_F16 *nn_rand1d_f16(size_t shape[1]) {
  Tensor1D_F16 *tensor = nn_tensor1d_f16(shape, NULL);
  size_t n = shape[0];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = as_f16(rand());
  }
  return tensor;
}

Tensor2D_F16 *nn_rand2d_f16(size_t shape[2]) {
  Tensor2D_F16 *tensor = nn_tensor2d_f16(shape, NULL);
  size_t n = shape[0] * shape[1];
  for (size_t i = 0; i < n; i += 1) {
    tensor->data[i] = as_f16(rand());
  }
  return tensor;
}
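
/**
 * Usage sketch (illustrative; not part of the original header): the 1D/2D
 * constructors above heap-allocate both the struct and its data buffer and,
 * when a source pointer is given, copy the elements rather than aliasing the
 * caller's array. Assumes nn.h is on the include path.
 *
 * @code
 * #include "nn.h"
 *
 * int main(void) {
 *   float16_t init[6];
 *   for (size_t i = 0; i < 6; i += 1) {
 *     init[i] = as_f16((float)i);
 *   }
 *
 *   Tensor2D_F16 *a = nn_tensor2d_f16((size_t[]){2, 3}, init);  // copies init
 *   Tensor2D_F16 *b = nn_ones2d_f16((size_t[]){2, 3});
 *   Tensor1D_F16 *c = nn_zeros1d_f16((size_t[]){3});
 *
 *   nn_print_tensor2d_f16(a);
 *   nn_print_tensor2d_f16(b);
 *   nn_print_tensor1d_f16(c);
 *   return 0;
 * }
 * @endcode
 */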


/* ======================================================================================================== */
/*                                             Tensor Prints                                                */
/* ======================================================================================================== */

void nn_print_f16(float16_t v, int16_t num_digits) {
  nn_print_f32(as_f32(v), num_digits);
}


void nn_print_tensor1d_f16(const Tensor1D_F16 *tensor) {
  printf("[");
  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
    nn_print_f16(*((float16_t *)tensor->data + i), 3);
    if (i < tensor->shape[0]-1) {
      printf(" ");
    }
  }
  printf("]\n");
}

void nn_print_tensor2d_f16(const Tensor2D_F16 *tensor) {
  printf("[");
  for (size_t i = 0; i < tensor->shape[0]; i += 1) {
    if (i != 0) {
      printf(" ");
    }
    printf("[");
    for (size_t j = 0; j < tensor->shape[1]; j += 1) {
      nn_print_f16(*((float16_t *)tensor->data + i*tensor->shape[1] + j), 3);
      if (j < tensor->shape[1]-1) {
        printf(" ");
      }
    }
    printf("]");
    if (i < tensor->shape[0]-1) {
      printf("\n");
    }
  }
  printf("]\n");
}


/* ======================================================================================================== */
/*                                               Comparison                                                 */
/* ======================================================================================================== */
uint8_t nn_equals0d_f16(const Tensor0D_F16 *a, const Tensor0D_F16 *b, float rel_err) {
  return nn_equal_f16(a->data, b->data, rel_err);
}

uint8_t nn_equals1d_f16(const Tensor1D_F16 *a, const Tensor1D_F16 *b, float rel_err) {
  nn_assert(a->shape[0] == b->shape[0], "Cannot compare tensors of different shapes");

  size_t n = a->shape[0];
  for (size_t i = 0; i < n; i += 1) {
    if (!nn_equal_f16(a->data[i], b->data[i], rel_err)) {
      return 0;
    }
  }
  return 1;
}

uint8_t nn_equals2d_f16(const Tensor2D_F16 *a, const Tensor2D_F16 *b, float rel_err) {
  nn_assert(a->shape[0] == b->shape[0] && a->shape[1] == b->shape[1], "Cannot compare tensors of different shapes");

  size_t n = a->shape[0] * a->shape[1];
  for (size_t i = 0; i < n; i += 1) {
    if (!nn_equal_f16(a->data[i], b->data[i], rel_err)) {
      return 0;
    }
  }
  return 1;
}
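
/**
 * Usage sketch (illustrative; not part of the original header): the equals
 * helpers require matching shapes and compare element-wise with nn_equal_f16,
 * which is handy for checking a kernel's output against a golden reference.
 * Assumes nn.h is on the include path.
 *
 * @code
 * #include <stdio.h>
 * #include "nn.h"
 *
 * int main(void) {
 *   Tensor2D_F16 *golden = nn_full2d_f16((size_t[]){2, 2}, as_f16(1.0f));
 *   Tensor2D_F16 *actual = nn_ones2d_f16((size_t[]){2, 2});
 *   printf("match: %d\n", nn_equals2d_f16(golden, actual, 1e-3f));  // prints 1
 *   return 0;
 * }
 * @endcode
 */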



/* ======================================================================================================== */
/*                                                  Unary                                                   */
/* ======================================================================================================== */
void nn_max1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x) {
  size_t n = x->shape[0];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
  #else // scalar implementation
    y->data = -FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x->data[i]);
      y->data = val > y->data ? val : y->data;
    }
  #endif
}

void nn_max2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x) {
  size_t n = x->shape[0] * x->shape[1];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_max = __riscv_vfmv_v_f_f16m1(-FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_max = __riscv_vfredmax_vs_f16m1_f16m1(vec_x, vec_max, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_max);
  #else // scalar implementation
    y->data = -FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x->data[i]);
      y->data = val > y->data ? val : y->data;
    }
  #endif
}


void nn_min1d_f16(Tensor0D_F16 *y, const Tensor1D_F16 *x) {
  size_t n = x->shape[0];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
  #else // scalar implementation
    y->data = FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x->data[i]);
      y->data = val < y->data ? val : y->data;
    }
  #endif
}


void nn_min2d_f16(Tensor0D_F16 *y, const Tensor2D_F16 *x) {
  size_t n = x->shape[0] * x->shape[1];
  float16_t *x_data = x->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    vfloat16m1_t vec_min = __riscv_vfmv_v_f_f16m1(FLT16_MAX, 1);

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vec_min = __riscv_vfredmin_vs_f16m1_f16m1(vec_x, vec_min, vl);
      x_data += vl;
      n -= vl;
    }
    y->data = __riscv_vfmv_f_s_f16m1_f16(vec_min);
  #else // scalar implementation
    y->data = FLT16_MAX;
    for (size_t i = 0; i < n; i += 1) {
      float val = as_f32(x->data[i]);
      y->data = val < y->data ? val : y->data;
    }
  #endif
}
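
/**
 * Usage sketch (illustrative; not part of the original header): the max/min
 * reductions write their scalar result into a 0D tensor. With the Zvfh backend
 * they use a strip-mined vfredmax/vfredmin loop; otherwise they fall back to
 * the scalar loop above. Assumes nn.h is on the include path.
 *
 * @code
 * #include <stdio.h>
 * #include "nn.h"
 *
 * int main(void) {
 *   float16_t vals[4] = { as_f16(-2.0f), as_f16(0.5f), as_f16(3.0f), as_f16(1.0f) };
 *   Tensor1D_F16 *x = nn_tensor1d_f16((size_t[]){4}, vals);
 *   Tensor0D_F16 *hi = nn_tensor0d_f16(as_f16(0.0f));
 *   Tensor0D_F16 *lo = nn_tensor0d_f16(as_f16(0.0f));
 *
 *   nn_max1d_f16(hi, x);
 *   nn_min1d_f16(lo, x);
 *   printf("max=%f min=%f\n", as_f32(hi->data), as_f32(lo->data));  // 3.0 and -2.0
 *   return 0;
 * }
 * @endcode
 */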


/* ======================================================================================================== */
/*                                                 Addition                                                 */
/* ======================================================================================================== */
void nn_add1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot add tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot add tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x1->data[i]) + as_f32(x2->data[i]));
    }
  #endif
}


void nn_add2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot add tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot add tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vv_f16m1(vec_x1, vec_x2, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x1->data[i]) + as_f32(x2->data[i]));
    }
  #endif
}

void nn_addscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot add tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x->data[i]) + as_f32(scalar));
    }
  #endif
}

void nn_addscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot add tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfadd_vf_f16m1(vec_x, scalar, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x->data[i]) + as_f32(scalar));
    }
  #endif
}
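
/**
 * Usage sketch (illustrative; not part of the original header): the element-wise
 * and scalar additions expect the output tensor to be pre-allocated with the
 * same shape as the inputs; the RVV path and the scalar fallback compute the
 * same result. Assumes nn.h is on the include path.
 *
 * @code
 * #include "nn.h"
 *
 * int main(void) {
 *   Tensor2D_F16 *x1 = nn_full2d_f16((size_t[]){2, 3}, as_f16(1.5f));
 *   Tensor2D_F16 *x2 = nn_full2d_f16((size_t[]){2, 3}, as_f16(0.5f));
 *   Tensor2D_F16 *y  = nn_zeros2d_f16((size_t[]){2, 3});
 *
 *   nn_add2d_f16(y, x1, x2);                   // y = x1 + x2 = 2.0 everywhere
 *   nn_addscalar2d_f16(y, x1, as_f16(10.0f));  // y = x1 + 10 = 11.5 everywhere
 *   nn_print_tensor2d_f16(y);
 *   return 0;
 * }
 * @endcode
 */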




/* ======================================================================================================== */
/*                                             Multiplication                                               */
/* ======================================================================================================== */


void nn_mul1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot multiply tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);  // element-wise product, matching the scalar fallback
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x1->data[i]) * as_f32(x2->data[i]));
    }
  #endif
}


void nn_mul2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0] && x1->shape[1] == x2->shape[1], "Cannot multiply tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x1->shape[1], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x1_data = x1->data;
  float16_t *x2_data = x2->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_data, vl);
      vfloat16m1_t vec_x2 = __riscv_vle16_v_f16m1(x2_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vv_f16m1(vec_x1, vec_x2, vl);  // element-wise product, matching the scalar fallback
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x1_data += vl;
      x2_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x1->data[i]) * as_f32(x2->data[i]));
    }
  #endif
}


void nn_mulscalar1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);  // vector-scalar product, matching the scalar fallback
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x->data[i]) * as_f32(scalar));
    }
  #endif
}


void nn_mulscalar2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float16_t scalar) {
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == x->shape[1], "Cannot multiply tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmul_vf_f16m1(vec_x, scalar, vl);  // vector-scalar product, matching the scalar fallback
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      y->data[i] = as_f16(as_f32(x->data[i]) * as_f32(scalar));
    }
  #endif
}



/* ======================================================================================================== */
/*                                                  MatMul                                                  */
/* ======================================================================================================== */
void nn_dot_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x1, const Tensor1D_F16 *x2) {
  nn_assert(x1->shape[0] == x2->shape[0], "Cannot dot tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0], "Cannot dot tensors of different shapes");

  size_t n = y->shape[0];
  float sum_f32 = 0;
  for (size_t i = 0; i < n; i += 1) {
    sum_f32 += as_f32(x1->data[i]) * as_f32(x2->data[i]);
  }
  y->data[0] = as_f16(sum_f32);
}
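
/**
 * Usage sketch (illustrative; not part of the original header): nn_dot_f16
 * accumulates in fp32 and stores the scalar result in y->data[0]; note that
 * the assertion above still expects y to be allocated with the same length as
 * the inputs. Assumes nn.h is on the include path.
 *
 * @code
 * #include <stdio.h>
 * #include "nn.h"
 *
 * int main(void) {
 *   float16_t a[3] = { as_f16(1.0f), as_f16(2.0f), as_f16(3.0f) };
 *   float16_t b[3] = { as_f16(4.0f), as_f16(5.0f), as_f16(6.0f) };
 *   Tensor1D_F16 *x1 = nn_tensor1d_f16((size_t[]){3}, a);
 *   Tensor1D_F16 *x2 = nn_tensor1d_f16((size_t[]){3}, b);
 *   Tensor1D_F16 *y  = nn_zeros1d_f16((size_t[]){3});  // length must match x1
 *
 *   nn_dot_f16(y, x1, x2);
 *   printf("dot = %f\n", as_f32(y->data[0]));  // 4 + 10 + 18 = 32
 *   return 0;
 * }
 * @endcode
 */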

void nn_mm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
  nn_assert(y->shape[0] == x1->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform MatMul on tensors of different shapes");

  const size_t n = x1->shape[0];
  const size_t m = x1->shape[1];
  const size_t p = x2->shape[1];

  for (size_t i = 0; i < n; i += 1) {
    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      float16_t *x1_row = x1->data + i * m;
      float16_t *y_row = y->data + i * p;

      size_t vlmax = __riscv_vsetvlmax_e16m1();
      for (size_t j = 0; j < p; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *x1_ptr = x1_row;
        float16_t *x2_ptr = x2->data + j;
        size_t k = m;

        while (k > 0) {
          size_t vl = __riscv_vsetvl_e16m1(k);
          vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
          vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_ptr, p * sizeof(float16_t), vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

          x1_ptr += vl;
          x2_ptr += vl * p;
          k -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif
        y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
      }
    #else
      for (size_t j = 0; j < p; j += 1) {
        float sum = 0.f;
        for (size_t k = 0; k < m; k += 1) {
          sum += as_f32(x1->data[i * m + k]) * as_f32(x2->data[k * p + j]);
        }
        y->data[i * p + j] = as_f16(sum);
      }
    #endif
  }
}
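
/**
 * Usage sketch (illustrative; not part of the original header): nn_mm_f16
 * multiplies an [n x m] matrix by an [m x p] matrix into a pre-allocated
 * [n x p] output, accumulating each dot product in fp32 on the scalar path or
 * with strided vector loads and an fp16 reduction on the RVV path.
 * Assumes nn.h is on the include path.
 *
 * @code
 * #include "nn.h"
 *
 * int main(void) {
 *   Tensor2D_F16 *x1 = nn_ones2d_f16((size_t[]){2, 4});                // [2 x 4]
 *   Tensor2D_F16 *x2 = nn_full2d_f16((size_t[]){4, 3}, as_f16(0.5f));  // [4 x 3]
 *   Tensor2D_F16 *y  = nn_zeros2d_f16((size_t[]){2, 3});               // [2 x 3]
 *
 *   nn_mm_f16(y, x1, x2);       // every entry is 4 * (1 * 0.5) = 2.0
 *   nn_print_tensor2d_f16(y);
 *   return 0;
 * }
 * @endcode
 */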


void nn_addmm_f16(Tensor2D_F16 *y, const Tensor2D_F16 *c, const Tensor2D_F16 *x1, const Tensor2D_F16 *x2) {
  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform Linear on tensors of different shapes");
  nn_assert(y->shape[0] == c->shape[0] && y->shape[1] == x2->shape[1], "Cannot perform Linear on tensors of different shapes");

  const size_t n = x1->shape[0];
  const size_t m = x1->shape[1];
  const size_t p = x2->shape[1];

  for (size_t i = 0; i < n; i += 1) {
    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      float16_t *x1_row = x1->data + i * m;
      float16_t *y_row = y->data + i * p;

      size_t vlmax = __riscv_vsetvlmax_e16m1();
      for (size_t j = 0; j < p; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *x1_ptr = x1_row;
        float16_t *x2_ptr = x2->data + j;
        size_t k = m;

        while (k > 0) {
          size_t vl = __riscv_vsetvl_e16m1(k);
          vfloat16m1_t vec_x1 = __riscv_vle16_v_f16m1(x1_ptr, vl);
          vfloat16m1_t vec_x2 = __riscv_vlse16_v_f16m1(x2_ptr, p * sizeof(float16_t), vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x1, vec_x2, vl);

          x1_ptr += vl;
          x2_ptr += vl * p;
          k -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif
        y_row[j] = __riscv_vfmv_f_s_f16m1_f16(vec_sum) + c->data[i * p + j];
      }

      x1_row += m;
      y_row += p;
    #else
      for (size_t j = 0; j < p; j += 1) {
        float sum = 0.f;
        for (size_t k = 0; k < m; k += 1) {
          sum += as_f32(x1->data[i * m + k]) * as_f32(x2->data[k * p + j]);
        }
        y->data[i * p + j] = as_f16(sum + as_f32(c->data[i * p + j]));
      }
    #endif
  }
}


void nn_linear_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, const Tensor2D_F16 *weight, const Tensor1D_F16 *bias) {
  nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
  nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
  nn_assert(y->shape[0] == x->shape[0] && y->shape[1] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");

  const size_t batch_size = x->shape[0];
  const size_t in_features = x->shape[1];
  const size_t out_features = weight->shape[0];

  float16_t *x_batch_data = x->data;
  float16_t *y_batch_data = y->data;

  for (size_t i = 0; i < batch_size; i += 1) {
    #ifdef CONFIG_BACKEND_RISCV_ZVFH
      float16_t *x_data = x_batch_data;
      float16_t *y_data = y_batch_data;

      size_t vlmax = __riscv_vsetvlmax_e16m1();

      for (size_t j = 0; j < out_features; j += 1) {
        vfloat16m1_t vec_zero = __riscv_vfmv_v_f_f16m1(0, vlmax);
        vfloat16m1_t vec_sum = __riscv_vfmv_v_f_f16m1(0, vlmax);

        float16_t *weight_row = weight->data + j * in_features;
        size_t n = in_features;

        while (n > 0) {
          size_t vl = __riscv_vsetvl_e16m1(n);
          vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
          vfloat16m1_t vec_w = __riscv_vle16_v_f16m1(weight_row, vl);
          vec_sum = __riscv_vfmacc_vv_f16m1(vec_sum, vec_x, vec_w, vl);

          x_data += vl;
          weight_row += vl;
          n -= vl;
        }

        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
          vec_sum = __riscv_vfredosum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #else
          vec_sum = __riscv_vfredusum_vs_f16m1_f16m1(vec_sum, vec_zero, vlmax);
        #endif

        float16_t sum = __riscv_vfmv_f_s_f16m1_f16(vec_sum);
        if (bias) {
          sum = as_f16(as_f32(sum) + as_f32(bias->data[j]));
        }
        y_data[j] = sum;
        x_data = x_batch_data;  // reset x_data pointer for next output feature
      }

      x_batch_data += in_features;
      y_batch_data += out_features;
    #else // scalar implementation
      for (size_t j = 0; j < out_features; j += 1) {
        float sum = 0.f;
        for (size_t k = 0; k < in_features; k += 1) {
          sum += as_f32(x->data[i * in_features + k]) * as_f32(weight->data[j * in_features + k]);
        }
        if (bias) {
          sum += as_f32(bias->data[j]);
        }
        y->data[i * out_features + j] = as_f16(sum);
      }
    #endif
  }
}
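
/**
 * Usage sketch (illustrative; not part of the original header): nn_linear_f16
 * computes y = x * weight^T + bias, so weight is laid out as
 * [out_features x in_features], bias as [out_features], and bias may be NULL.
 * Assumes nn.h is on the include path.
 *
 * @code
 * #include "nn.h"
 *
 * int main(void) {
 *   const size_t batch = 2, in_features = 4, out_features = 3;
 *   Tensor2D_F16 *x = nn_ones2d_f16((size_t[]){batch, in_features});
 *   Tensor2D_F16 *w = nn_full2d_f16((size_t[]){out_features, in_features}, as_f16(0.25f));
 *   Tensor1D_F16 *b = nn_full1d_f16((size_t[]){out_features}, as_f16(1.0f));
 *   Tensor2D_F16 *y = nn_zeros2d_f16((size_t[]){batch, out_features});
 *
 *   nn_linear_f16(y, x, w, b);     // each output is 4 * 0.25 + 1 = 2.0
 *   nn_print_tensor2d_f16(y);
 *
 *   nn_linear_f16(y, x, w, NULL);  // bias is optional
 *   return 0;
 * }
 * @endcode
 */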



/* ======================================================================================================== */
/*                                                Non-linear                                                */
/* ======================================================================================================== */
void nn_elu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x, float alpha) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ELU on tensors of different shapes");

  const size_t n = y->shape[0] * y->shape[1];
  for (size_t i = 0; i < n; i += 1) {
    if (as_f32(x->data[i]) > 0) {
      y->data[i] = x->data[i];
    }
    else {
      y->data[i] = as_f16(alpha * (expf(as_f32(x->data[i])) - 1.f));
    }
  }
}


void nn_relu2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform ReLU on tensors of different shapes");

  size_t n = y->shape[0] * y->shape[1];
  float16_t *x_data = x->data;
  float16_t *y_data = y->data;

  #ifdef CONFIG_BACKEND_RISCV_ZVFH
    float16_t zero = 0.0f;

    while (n > 0) {
      size_t vl = __riscv_vsetvl_e16m1(n);
      vfloat16m1_t vec_x = __riscv_vle16_v_f16m1(x_data, vl);
      vfloat16m1_t vec_y = __riscv_vfmax_vf_f16m1(vec_x, zero, vl);
      __riscv_vse16_v_f16m1(y_data, vec_y, vl);
      x_data += vl;
      y_data += vl;
      n -= vl;
    }
  #else // scalar implementation
    for (size_t i = 0; i < n; i += 1) {
      float x_val = as_f32(x->data[i]);
      y->data[i] = x_val > 0 ? as_f16(x_val) : 0;
    }
  #endif
}


void nn_softmax1d_f16(Tensor1D_F16 *y, const Tensor1D_F16 *x) {
  nn_assert(y->shape[0] == x->shape[0], "Cannot perform Softmax on tensors of different shapes");

  size_t n = y->shape[0];
  float sum = 0.0f;
  for (size_t i = 0; i < n; i += 1) {
    sum += expf(as_f32(x->data[i]));
  }

  for (size_t i = 0; i < n; i += 1) {
    y->data[i] = as_f16(expf(as_f32(x->data[i])) / sum);
  }
}


void nn_tanh2d_f16(Tensor2D_F16 *y, const Tensor2D_F16 *x) {
  nn_assert(x->shape[0] == y->shape[0] && x->shape[1] == y->shape[1], "Cannot perform Tanh on tensors of different shapes");

  const size_t n = y->shape[0] * y->shape[1];
  for (size_t i = 0; i < n; i += 1) {
    float x_val = as_f32(x->data[i]);
    y->data[i] = as_f16(tanh(x_val));
  }
}
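
/**
 * Usage sketch (illustrative; not part of the original header): the non-linear
 * kernels above also expect pre-allocated outputs; ELU, ReLU, and tanh operate
 * element-wise on 2D tensors, while softmax normalizes a 1D tensor so its
 * entries sum to 1. Assumes nn.h is on the include path.
 *
 * @code
 * #include "nn.h"
 *
 * int main(void) {
 *   float16_t vals[4] = { as_f16(-1.0f), as_f16(0.0f), as_f16(1.0f), as_f16(2.0f) };
 *   Tensor2D_F16 *x2d = nn_tensor2d_f16((size_t[]){2, 2}, vals);
 *   Tensor2D_F16 *y2d = nn_zeros2d_f16((size_t[]){2, 2});
 *   Tensor1D_F16 *x1d = nn_tensor1d_f16((size_t[]){4}, vals);
 *   Tensor1D_F16 *y1d = nn_zeros1d_f16((size_t[]){4});
 *
 *   nn_relu2d_f16(y2d, x2d);       // negative entries clamp to 0
 *   nn_elu2d_f16(y2d, x2d, 1.0f);  // alpha = 1.0
 *   nn_tanh2d_f16(y2d, x2d);
 *   nn_softmax1d_f16(y1d, x1d);    // entries sum to 1
 *   return 0;
 * }
 * @endcode
 */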




#endif // __NN_F16_H