Excerpted lines from nn_f32.h; the numbers are the header's own line numbers, and gaps mark lines that were not captured in this excerpt.

// RISC-V Vector backend support (nn_f32.h:13-14)
13  #ifdef CONFIG_BACKEND_RISCV_V
14    #include "riscv_vector.h"

// nn_equal_f32 (nn_f32.h:67)
67  static inline uint8_t nn_equal_f32(float golden, float actual, float rel_err) {
68    return (fabs(actual - golden) < rel_err) || (fabs((actual - golden) / actual) < rel_err);
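The helper above accepts a pair when either the absolute error or the error relative to actual is below rel_err. A small illustrative call, assuming the header is included; the numbers are chosen for this sketch, not taken from the source:

  // |1.0005 - 1.0| = 5e-4 < 1e-3, so the absolute branch already passes;
  // the relative branch, 5e-4 / 1.0005, would pass as well.
  uint8_t ok = nn_equal_f32(1.0f, 1.0005f, 1e-3f);   // ok == 1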
// nn_tensor1d_f32 (nn_f32.h:97)
 99   tensor->shape[0] = shape[0];
101   size_t n_bytes = shape[0] * sizeof(float);
102   tensor->data = (float *)malloc(n_bytes);
104   memcpy(tensor->data, data, n_bytes);

// nn_tensor2d_f32 (nn_f32.h:117)
119   tensor->shape[0] = shape[0];
120   tensor->shape[1] = shape[1];
122   size_t n_bytes = shape[0] * shape[1] * sizeof(float);
123   tensor->data = (float *)malloc(n_bytes);
125   memcpy(tensor->data, data, n_bytes);

// nn_tensor3d_f32 (nn_f32.h:138)
140   tensor->shape[0] = shape[0];
141   tensor->shape[1] = shape[1];
142   tensor->shape[2] = shape[2];
144   size_t n_bytes = shape[0] * shape[1] * shape[2] * sizeof(float);
145   tensor->data = (float *)malloc(n_bytes);
147   memcpy(tensor->data, data, n_bytes);

// nn_tensor4d_f32 (nn_f32.h:160)
162   tensor->shape[0] = shape[0];
163   tensor->shape[1] = shape[1];
164   tensor->shape[2] = shape[2];
165   tensor->shape[3] = shape[3];
167   size_t n_bytes = shape[0] * shape[1] * shape[2] * shape[3] * sizeof(float);
168   tensor->data = (float *)malloc(n_bytes);
170   memcpy(tensor->data, data, n_bytes);

// nn_as_tensor1d_f32 (nn_f32.h:181)
183   tensor->shape[0] = shape[0];

// nn_as_tensor2d_f32 (nn_f32.h:194)
196   tensor->shape[0] = shape[0];
197   tensor->shape[1] = shape[1];

// nn_as_tensor3d_f32 (nn_f32.h:208)
210   tensor->shape[0] = shape[0];
211   tensor->shape[1] = shape[1];
212   tensor->shape[2] = shape[2];

// nn_as_tensor4d_f32 (nn_f32.h:223)
225   tensor->shape[0] = shape[0];
226   tensor->shape[1] = shape[1];
227   tensor->shape[2] = shape[2];
228   tensor->shape[3] = shape[3];
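For orientation: the nn_tensorNd_f32 constructors above allocate a fresh buffer and memcpy the caller's data into it, while the nn_as_tensorNd_f32 variants appear to wrap the caller's buffer in place (only their shape assignments were captured, and they take a non-const float *). A minimal usage sketch; the example_ wrapper and the "nn.h" include path are assumptions of this sketch:

#include "nn.h"   // assumed umbrella header exposing the Tensor*_F32 types and the functions above

void example_constructors(void) {
  float buf[6] = {1, 2, 3, 4, 5, 6};

  // Copying constructor: allocates 2*3*sizeof(float) bytes and memcpy()s buf into it.
  Tensor2D_F32 *a = nn_tensor2d_f32((size_t[]){2, 3}, buf);

  // Wrapping constructor: records the shape; per the captured fragments it does not copy,
  // so b is expected to alias buf.
  Tensor2D_F32 *b = nn_as_tensor2d_f32((size_t[]){2, 3}, buf);

  nn_print_tensor2d_f32(a);
  nn_print_tensor2d_f32(b);
}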
// nn_zeros1d_f32 (nn_f32.h:245)
248   for (size_t i = 0; i < n; i += 1) {

// nn_zeros2d_f32 (nn_f32.h:257)
259   size_t n = shape[0] * shape[1];
260   for (size_t i = 0; i < n; i += 1) {

// nn_zeros3d_f32 (nn_f32.h:269)
271   size_t n = shape[0] * shape[1] * shape[2];
272   for (size_t i = 0; i < n; i += 1) {

// nn_zeros4d_f32 (nn_f32.h:281)
283   size_t n = shape[0] * shape[1] * shape[2] * shape[3];
284   for (size_t i = 0; i < n; i += 1) {

// nn_ones1d_f32 (nn_f32.h:301)
304   for (size_t i = 0; i < n; i += 1) {

// nn_ones2d_f32 (nn_f32.h:313)
315   size_t n = shape[0] * shape[1];
316   for (size_t i = 0; i < n; i += 1) {

// nn_full1d_f32 (nn_f32.h:333)
336   for (size_t i = 0; i < n; i += 1) {
337     tensor->data[i] = data;

// nn_full2d_f32 (nn_f32.h:345)
347   size_t n = shape[0] * shape[1];
348   for (size_t i = 0; i < n; i += 1) {
349     tensor->data[i] = data;

// nn_rand1d_f32 (nn_f32.h:365)
368   for (size_t i = 0; i < n; i += 1) {
369     tensor->data[i] = rand();

// nn_rand2d_f32 (nn_f32.h:377)
379   size_t n = shape[0] * shape[1];
380   for (size_t i = 0; i < n; i += 1) {
381     tensor->data[i] = rand();
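Each factory follows the same pattern at every rank: compute the element count n as the product of the shape entries, then fill every element with 0, 1, a caller-supplied constant, or a raw rand() value converted to float. A usage sketch under the same assumptions as the earlier sketch:

#include "nn.h"   // assumed umbrella header

void example_factories(void) {
  Tensor1D_F32 *z = nn_zeros1d_f32((size_t[]){4});        // {0, 0, 0, 0}
  Tensor1D_F32 *o = nn_ones1d_f32((size_t[]){4});         // {1, 1, 1, 1}
  Tensor1D_F32 *f = nn_full1d_f32((size_t[]){4}, 2.5f);   // {2.5, 2.5, 2.5, 2.5}
  Tensor2D_F32 *r = nn_rand2d_f32((size_t[]){2, 2});      // raw rand() output, not normalized

  nn_print_tensor1d_f32(z);
  nn_print_tensor1d_f32(o);
  nn_print_tensor1d_f32(f);
  nn_print_tensor2d_f32(r);
}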
// nn_print_tensor1d_f32 (nn_f32.h:396)
398   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
400     if (i < tensor->shape[0] - 1) {

// nn_print_tensor2d_f32 (nn_f32.h:412)
414   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
421     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
423       if (j < tensor->shape[1] - 1) {
428     if (i < tensor->shape[0] - 1) {

// nn_print_tensor3d_f32 (nn_f32.h:440)
442   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
449     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
456       for (size_t k = 0; k < tensor->shape[2]; k += 1) {
458         if (k < tensor->shape[2] - 1) {
465     if (i < tensor->shape[0] - 1) {

// nn_print_tensor4d_f32 (nn_f32.h:477)
479   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
486     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
493       for (size_t k = 0; k < tensor->shape[2]; k += 1) {
500         for (size_t l = 0; l < tensor->shape[3]; l += 1) {
502           if (l < tensor->shape[3] - 1) {
507         if (k < tensor->shape[2] - 1) {
512       if (j < tensor->shape[1] - 1) {
517     if (i < tensor->shape[0] - 1) {
// nn_equals1d_f32 (nn_f32.h:548)
551   size_t n = a->shape[0];
552   for (size_t i = 0; i < n; i += 1) {

// nn_equals2d_f32 (nn_f32.h:568)
572   for (size_t i = 0; i < n; i += 1) {

// nn_equals3d_f32 (nn_f32.h:588)
592   for (size_t i = 0; i < n; i += 1) {

// nn_equals4d_f32 (nn_f32.h:608)
612   for (size_t i = 0; i < n; i += 1) {
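The nn_equalsNd_f32 helpers walk every element of the flattened tensors, presumably applying the scalar nn_equal_f32 check from line 67 to each golden/actual pair. A sketch of a golden-model comparison, under the same assumptions as the earlier sketches (the tolerance 1e-3f is arbitrary):

#include "nn.h"   // assumed umbrella header

void example_compare(void) {
  float golden[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float actual[4] = {1.0f, 2.0f, 3.0f, 4.00001f};

  Tensor1D_F32 *a = nn_tensor1d_f32((size_t[]){4}, golden);
  Tensor1D_F32 *b = nn_tensor1d_f32((size_t[]){4}, actual);

  // Element-wise tolerance check; the assert fires if any element is out of tolerance.
  nn_assert(nn_equals1d_f32(a, b, 1e-3f), "actual does not match golden within 1e-3");
}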
// nn_max1d_f32 (nn_f32.h:624)
625   size_t n = x->shape[0];
626   float *x_data = x->data;
628   #ifdef CONFIG_BACKEND_RISCV_V
629     vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
632       size_t vl = __riscv_vsetvl_e32m1(n);
633       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
634       vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
638     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
641     for (size_t i = 0; i < n; i += 1) {
642       float val = x->data[i];

// nn_max2d_f32 (nn_f32.h:648)
650   float *x_data = x->data;
652   #ifdef CONFIG_BACKEND_RISCV_V
653     vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
656       size_t vl = __riscv_vsetvl_e32m1(n);
657       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
658       vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
662     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
665     for (size_t i = 0; i < n; i += 1) {
666       float val = x->data[i];

// nn_min1d_f32 (nn_f32.h:672)
673   size_t n = x->shape[0];
674   float *x_data = x->data;
676   #ifdef CONFIG_BACKEND_RISCV_V
677     vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
680       size_t vl = __riscv_vsetvl_e32m1(n);
681       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
682       vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
686     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
689     for (size_t i = 0; i < n; i += 1) {
690       float val = x->data[i];

// nn_min2d_f32 (nn_f32.h:696)
698   float *x_data = x->data;
700   #ifdef CONFIG_BACKEND_RISCV_V
701     vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
704       size_t vl = __riscv_vsetvl_e32m1(n);
705       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
706       vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
710     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
713     for (size_t i = 0; i < n; i += 1) {
714       float val = x->data[i];
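In the RVV path of the reductions above, the running extremum lives in element 0 of a vector register: it is seeded with -FLT_MAX (or FLT_MAX) via vfmv_s_f, folded with vfredmax (or vfredmin) over each strip of the input (the surrounding strip-mining loop is not among the captured lines), and finally read back into the scalar y->data with vfmv_f_s. The scalar fallback simply walks the array. Usage sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_reductions(void) {
  float values[5] = {-2.f, 7.f, 0.5f, -9.f, 3.f};
  Tensor1D_F32 *x = nn_tensor1d_f32((size_t[]){5}, values);

  Tensor0D_F32 *hi = nn_zeros0d_f32();
  Tensor0D_F32 *lo = nn_zeros0d_f32();

  nn_max1d_f32(hi, x);   // hi->data == 7.0f
  nn_min1d_f32(lo, x);   // lo->data == -9.0f

  // Tensor0D_F32 stores a plain float in .data, so the results print as scalars.
  nn_print_f32(hi->data, 3);
  nn_print_f32(lo->data, 3);
}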
// nn_add1d_f32 (nn_f32.h:734)
738   size_t n = y->shape[0];
739   float *x1_data = x1->data;
740   float *x2_data = x2->data;
741   float *y_data = y->data;
743   #ifdef CONFIG_BACKEND_RISCV_V
745       size_t vl = __riscv_vsetvl_e32m1(n);
746       vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
747       vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
748       vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
749       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
756     for (size_t i = 0; i < n; i += 1) {

// nn_add2d_f32 (nn_f32.h:773)
778   float *x1_data = x1->data;
779   float *x2_data = x2->data;
780   float *y_data = y->data;
782   #ifdef CONFIG_BACKEND_RISCV_V
784       size_t vl = __riscv_vsetvl_e32m1(n);
785       vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
786       vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
787       vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
788       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
795     for (size_t i = 0; i < n; i += 1) {

// nn_addscalar1d_f32 (nn_f32.h:801)
804   size_t n = y->shape[0];
805   float *x_data = x->data;
806   float *y_data = y->data;
808   #ifdef CONFIG_BACKEND_RISCV_V
810       size_t vl = __riscv_vsetvl_e32m1(n);
811       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
812       vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
813       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
819     for (size_t i = 0; i < n; i += 1) {

// nn_addscalar2d_f32 (nn_f32.h:825)
829   float *x_data = x->data;
830   float *y_data = y->data;
832   #ifdef CONFIG_BACKEND_RISCV_V
834       size_t vl = __riscv_vsetvl_e32m1(n);
835       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
836       vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
837       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
843     for (size_t i = 0; i < n; i += 1) {

// nn_mul1d_f32 (nn_f32.h:854)
858   size_t n = y->shape[0];
859   float *x1_data = x1->data;
860   float *x2_data = x2->data;
861   float *y_data = y->data;
863   #ifdef CONFIG_BACKEND_RISCV_V
865       size_t vl = __riscv_vsetvl_e32m1(n);
866       vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
867       vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
868       vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
869       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
876     for (size_t i = 0; i < n; i += 1) {

// nn_mul2d_f32 (nn_f32.h:882)
887   float *x1_data = x1->data;
888   float *x2_data = x2->data;
889   float *y_data = y->data;
891   #ifdef CONFIG_BACKEND_RISCV_V
893       size_t vl = __riscv_vsetvl_e32m1(n);
894       vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
895       vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
896       vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
897       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
904     for (size_t i = 0; i < n; i += 1) {

// nn_mulscalar1d_f32 (nn_f32.h:910)
913   size_t n = y->shape[0];
914   float *x_data = x->data;
915   float *y_data = y->data;
917   #ifdef CONFIG_BACKEND_RISCV_V
919       size_t vl = __riscv_vsetvl_e32m1(n);
920       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
921       vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
922       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
928     for (size_t i = 0; i < n; i += 1) {

// nn_mulscalar2d_f32 (nn_f32.h:934)
938   float *x_data = x->data;
939   float *y_data = y->data;
941   #ifdef CONFIG_BACKEND_RISCV_V
943       size_t vl = __riscv_vsetvl_e32m1(n);
944       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
945       vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
946       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
952     for (size_t i = 0; i < n; i += 1) {

// nn_dot_f32 (nn_f32.h:962)
966   size_t n = y->shape[0];
967   float *x1_data = x1->data;
968   float *x2_data = x2->data;
969   float *y_data = y->data;
971   #ifdef CONFIG_BACKEND_RISCV_V
973       size_t vl = __riscv_vsetvl_e32m1(n);
974       vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
975       vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
976       vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
977       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
985     for (size_t i = 0; i < n; i += 1) {
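All of the element-wise kernels above share one structure: load a strip of each input with vle32, combine with vfadd/vfmul (the *_vf variants broadcast a scalar), and store with vse32; the scalar fallback indexes the arrays directly. nn_dot_f32 shows the same multiply-and-store pattern in its captured lines. A combined usage sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_elementwise(void) {
  float va[3] = {1.f, 2.f, 3.f};
  float vb[3] = {4.f, 5.f, 6.f};

  Tensor1D_F32 *a = nn_tensor1d_f32((size_t[]){3}, va);
  Tensor1D_F32 *b = nn_tensor1d_f32((size_t[]){3}, vb);
  Tensor1D_F32 *y = nn_zeros1d_f32((size_t[]){3});

  nn_add1d_f32(y, a, b);            // y = {5, 7, 9}
  nn_mulscalar1d_f32(y, a, 10.f);   // y = {10, 20, 30}
  nn_mul1d_f32(y, a, b);            // y = {4, 10, 18}, element-wise product
  nn_dot_f32(y, a, b);              // same element-wise multiply pattern in the excerpt above

  nn_print_tensor1d_f32(y);
}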
// nn_mm_f32 (nn_f32.h:999)
1000   nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
1003   const size_t n = x1->shape[0];
1004   const size_t m = x1->shape[1];
1005   const size_t p = x2->shape[1];
1007   for (size_t i = 0; i < n; i += 1) {
1008   #ifdef CONFIG_BACKEND_RISCV_V
1009     float *x1_row = x1->data + i * m;
1010     float *y_row = y->data + i * p;
1012     size_t vlmax = __riscv_vsetvlmax_e32m1();
1013     for (size_t j = 0; j < p; j += 1) {
1014       vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1015       vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1017       float *x1_ptr = x1_row;
1018       float *x2_ptr = x2->data + j;
1022         size_t vl = __riscv_vsetvl_e32m1(k);
1023         vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_ptr, vl);
1024         vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_ptr, p * sizeof(float), vl);
1025         vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1032       #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1033         vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1035         vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1037       y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1040     for (size_t j = 0; j < p; j += 1) {
1042       for (size_t k = 0; k < m; k += 1) {
1043         sum += x1->data[i * m + k] * x2->data[k * p + j];
1045       y->data[i * p + j] = sum;
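nn_mm_f32 computes y = x1 * x2 for an (n x m) by (m x p) pair. Each output element is a dot product: vle32 loads a contiguous strip of the x1 row, vlse32 gathers the matching x2 column with a byte stride of p * sizeof(float), vfmacc accumulates, and a final vfredusum (or the ordered vfredosum when CONFIG_DEBUG_RISCV_V_USE_REDOSUM is defined) collapses the partial sums. Usage sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_matmul(void) {
  float a_data[6] = {1, 2, 3,
                     4, 5, 6};   // 2 x 3
  float b_data[6] = {1, 0,
                     0, 1,
                     1, 1};      // 3 x 2

  Tensor2D_F32 *a = nn_tensor2d_f32((size_t[]){2, 3}, a_data);
  Tensor2D_F32 *b = nn_tensor2d_f32((size_t[]){3, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});

  nn_mm_f32(y, a, b);   // y = {{4, 5}, {10, 11}}
  nn_print_tensor2d_f32(y);
}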
// nn_addmm_f32 (nn_f32.h:1052)
1053   nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform Linear on tensors of different shapes");
1056   const size_t n = x1->shape[0];
1057   const size_t m = x1->shape[1];
1058   const size_t p = x2->shape[1];
1060   for (size_t i = 0; i < n; i += 1) {
1061   #ifdef CONFIG_BACKEND_RISCV_V
1062     float *x1_row = x1->data + i * m;
1063     float *y_row = y->data + i * p;
1065     size_t vlmax = __riscv_vsetvlmax_e32m1();
1066     for (size_t j = 0; j < p; j += 1) {
1067       vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1068       vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1070       float *x1_ptr = x1_row;
1071       float *x2_ptr = x2->data + j;
1075         size_t vl = __riscv_vsetvl_e32m1(k);
1076         vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_ptr, vl);
1077         vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_ptr, p * sizeof(float), vl);
1078         vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1085       #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1086         vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1088         vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1090       y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum) + c->data[i * p + j];
1096     for (size_t j = 0; j < p; j += 1) {
1098       for (size_t k = 0; k < m; k += 1) {
1099         sum += x1->data[i * m + k] * x2->data[k * p + j];
1101       y->data[i * p + j] = sum + c->data[i * p + j];
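nn_addmm_f32 is the same kernel with an addend: y = c + x1 * x2, with c read element-for-element alongside y (so c must be (n x p) as well). Sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_addmm(void) {
  Tensor2D_F32 *x1 = nn_ones2d_f32((size_t[]){2, 3});
  Tensor2D_F32 *x2 = nn_ones2d_f32((size_t[]){3, 2});
  Tensor2D_F32 *c  = nn_full2d_f32((size_t[]){2, 2}, 10.f);
  Tensor2D_F32 *y  = nn_zeros2d_f32((size_t[]){2, 2});

  nn_addmm_f32(y, c, x1, x2);   // every element: 10 + (1 + 1 + 1) = 13
  nn_print_tensor2d_f32(y);
}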
// nn_linear_f32 (nn_f32.h:1107)
1108   nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
1109   nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
1112   const size_t batch_size = x->shape[0];
1113   const size_t in_features = x->shape[1];
1114   const size_t out_features = weight->shape[0];
1116   float *x_batch_data = x->data;
1117   float *y_batch_data = y->data;
1119   for (size_t i = 0; i < batch_size; i += 1) {
1120   #ifdef CONFIG_BACKEND_RISCV_V
1121     float *x_data = x_batch_data;
1122     float *y_data = y_batch_data;
1124     size_t vlmax = __riscv_vsetvlmax_e32m1();
1126     for (size_t j = 0; j < out_features; j += 1) {
1127       vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1128       vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1130       float *weight_row = weight->data + j * in_features;
1131       size_t n = in_features;
1134         size_t vl = __riscv_vsetvl_e32m1(n);
1135         vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1136         vfloat32m1_t vec_w = __riscv_vle32_v_f32m1(weight_row, vl);
1137         vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x, vec_w, vl);
1144       #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1145         vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1147         vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1150       float sum = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1152         sum += bias->data[j];
1155       x_data = x_batch_data;
1158     x_batch_data += in_features;
1159     y_batch_data += out_features;
1161     for (size_t j = 0; j < out_features; j += 1) {
1163       for (size_t k = 0; k < in_features; k += 1) {
1164         sum += x->data[i * in_features + k] * weight->data[j * in_features + k];
1167         sum += bias->data[j];
1169       y->data[i * out_features + j] = sum;
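Unlike nn_mm_f32, nn_linear_f32 stores the weight as (out_features x in_features), so each output is the dot product of an input row with a weight row: both operands stream contiguously through vle32 and no strided load is needed. The bias is optional; passing NULL skips the addition (the header itself does so in the attention kernel below). Sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_linear(void) {
  // batch_size = 2, in_features = 3, out_features = 4
  Tensor2D_F32 *x      = nn_rand2d_f32((size_t[]){2, 3});
  Tensor2D_F32 *weight = nn_rand2d_f32((size_t[]){4, 3});   // (out_features, in_features)
  Tensor1D_F32 *bias   = nn_zeros1d_f32((size_t[]){4});
  Tensor2D_F32 *y      = nn_zeros2d_f32((size_t[]){2, 4});

  nn_linear_f32(y, x, weight, bias);   // y[i][j] = dot(x[i], weight[j]) + bias[j]
  nn_linear_f32(y, x, weight, NULL);   // bias is optional; NULL leaves the sum unbiased
  nn_print_tensor2d_f32(y);
}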
// nn_elu2d_f32 (nn_f32.h:1180)
1184   for (size_t i = 0; i < n; i += 1) {
1185     if (x->data[i] > 0) {
1189       y->data[i] = alpha * (expf(x->data[i]) - 1.f);

// nn_relu2d_f32 (nn_f32.h:1194)
1198   float *x_data = x->data;
1199   float *y_data = y->data;
1201   #ifdef CONFIG_BACKEND_RISCV_V
1205       size_t vl = __riscv_vsetvl_e32m1(n);
1206       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1207       vfloat32m1_t vec_y = __riscv_vfmax_vf_f32m1(vec_x, zero, vl);
1208       __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1214     for (size_t i = 0; i < n; i += 1) {
1215       float x_val = x->data[i];
1216       y->data[i] = x_val > 0 ? x_val : 0;

// nn_softmax1d_f32 (nn_f32.h:1222)
1225   size_t n = y->shape[0];
1227   for (size_t i = 0; i < n; i += 1) {
1228     sum += expf(x->data[i]);
1231   for (size_t i = 0; i < n; i += 1) {
1232     y->data[i] = expf(x->data[i]) / sum;

// nn_softmax2d_f32 (nn_f32.h:1236)
1239   float *y_data = y->data;
1240   float *x_data = x->data;
1243     for (size_t i = 0; i < y->shape[1]; i += 1) {
1244       size_t n = y->shape[0];
1245       size_t m = y->shape[1];
1247       for (size_t j = 0; j < n; j += 1) {
1248         sum += expf(x_data[j * m]);
1251       for (size_t j = 0; j < n; j += 1) {
1252         y_data[j * m] = expf(x_data[j * m]) / sum;
1259   else if (dim == 1) {
1261     for (size_t i = 0; i < y->shape[0]; i += 1) {
1262       size_t n = y->shape[1];
1264       for (size_t j = 0; j < n; j += 1) {
1265         sum += expf(x_data[j]);
1268       for (size_t j = 0; j < n; j += 1) {
1269         y_data[j] = expf(x_data[j]) / sum;
1277     nn_assert(0, "Invalid dimension for softmax");

// nn_tanh2d_f32 (nn_f32.h:1283)
1287   for (size_t i = 0; i < n; i += 1) {
1288     float x_val = x->data[i];
1289     y->data[i] = tanh(x_val);
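The activation kernels write y element-for-element from x over the flattened element count: ReLU clamps against zero (vfmax_vf in the RVV path), ELU falls back to alpha * (expf(x) - 1) for non-positive inputs, tanh applies tanh per element, and softmax exponentiates and then divides by the sum along the requested dim (0 normalizes down each column, 1 across each row). Sketch, same assumptions as above:

#include "nn.h"   // assumed umbrella header

void example_activations(void) {
  float vals[4] = {-1.f, 0.f, 2.f, -3.f};
  Tensor2D_F32 *x = nn_tensor2d_f32((size_t[]){2, 2}, vals);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});

  nn_relu2d_f32(y, x);          // {{0, 0}, {2, 0}}
  nn_elu2d_f32(y, x, 1.0f);     // non-positive entries become alpha * (exp(x) - 1)
  nn_tanh2d_f32(y, x);          // element-wise tanh
  nn_softmax2d_f32(y, x, 1);    // dim = 1: each row of y sums to 1
  nn_print_tensor2d_f32(y);
}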
// nn_scaled_dot_product_attention_f32 (nn_f32.h:1309)
1311   nn_assert(query->shape[1] == key->shape[1] && query->shape[1] == value->shape[1], "Query, key, and value must have the same sequence length");
1313   nn_assert(query->shape[3] == key->shape[3] && query->shape[3] == value->shape[3], "Query, key, and value must have the same embedding dimension");
1316   size_t n = query->shape[0];
1317   size_t h = query->shape[1];
1318   size_t s = key->shape[2];
1319   size_t l = query->shape[2];
1320   size_t e = query->shape[3];
1321   size_t ev = value->shape[3];
1324   float scale_factor = 1 / sqrt(query->shape[3]);
1339   size_t attn_weight_dims[4] = {n, h, l, s};
1342   size_t query_head_dims[2] = {l, e};
1343   size_t key_head_dims[2] = {l, e};
1344   size_t attn_weight_head_dims[2] = {l, s};
1345   size_t value_head_dims[2] = {s, ev};
1346   size_t y_head_dims[2] = {l, ev};
1348   for (size_t head = 0; head < h; head += 1) {
1356     nn_linear_f32(attn_weight_head, query_head, key_head, NULL);
1367     nn_mm_f32(y_head, attn_weight_head, value_head);
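The attention kernel works head by head: with query (n, h, l, e), key (n, h, s, e) and value (n, h, s, ev), it computes the scale factor 1/sqrt(e), builds an (l x s) attention-weight matrix per head by calling nn_linear_f32 with the key head in the weight slot (which supplies the implicit key transpose), and multiplies by the value head with nn_mm_f32 to get the (l x ev) output; the steps between those two calls fall in lines not captured above. Shape-only sketch, same assumptions as before (zero inputs are used just to get correctly sized buffers):

#include "nn.h"   // assumed umbrella header

void example_attention(void) {
  // batch n = 1, heads h = 2, sequence length l = s = 4, head dims e = ev = 8
  Tensor4D_F32 *q = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});
  Tensor4D_F32 *k = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});
  Tensor4D_F32 *v = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});
  Tensor4D_F32 *y = nn_zeros4d_f32((size_t[]){1, 2, 4, 8});

  // Real callers would fill q, k, v before the call; zeros only demonstrate the shapes.
  nn_scaled_dot_product_attention_f32(y, q, k, v);
  nn_print_tensor4d_f32(y);
}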
Cross-reference of the symbols used above (signatures and definition lines from the generated index):

Tensor struct members
  Tensor0D_F32:  float data                     (nn_f32.h:21)
  Tensor1D_F32:  size_t shape[1]; float *data   (nn_f32.h:29-30)
  Tensor2D_F32:  size_t shape[2]; float *data   (nn_f32.h:38-39)
  Tensor3D_F32:  size_t shape[3]; float *data   (nn_f32.h:46-47)
  Tensor4D_F32:  size_t shape[4]; float *data   (nn_f32.h:54-55)

Utilities (nn.h)
  static void nn_assert(int condition, char *message)   (nn.h:54)
  void nn_print_f32(float v, int16_t num_digits)         (nn.h:91)

Scalar comparison
  static uint8_t nn_equal_f32(float golden, float actual, float rel_err)   (nn_f32.h:67)

Constructors (copy the caller's data)
  Tensor0D_F32 *nn_tensor0d_f32(float data)                           (nn_f32.h:83)
  Tensor1D_F32 *nn_tensor1d_f32(size_t shape[1], const float *data)   (nn_f32.h:97)
  Tensor2D_F32 *nn_tensor2d_f32(size_t shape[2], const float *data)   (nn_f32.h:117)
  Tensor3D_F32 *nn_tensor3d_f32(size_t shape[3], const float *data)   (nn_f32.h:138)
  Tensor4D_F32 *nn_tensor4d_f32(size_t shape[4], const float *data)   (nn_f32.h:160)

Wrappers (nn_as_tensor*)
  Tensor1D_F32 *nn_as_tensor1d_f32(size_t shape[1], float *data)   (nn_f32.h:181)
  Tensor2D_F32 *nn_as_tensor2d_f32(size_t shape[2], float *data)   (nn_f32.h:194)
  Tensor3D_F32 *nn_as_tensor3d_f32(size_t shape[3], float *data)   (nn_f32.h:208)
  Tensor4D_F32 *nn_as_tensor4d_f32(size_t shape[4], float *data)   (nn_f32.h:223)

Factories
  Tensor0D_F32 *nn_zeros0d_f32()                             (nn_f32.h:237)
  Tensor1D_F32 *nn_zeros1d_f32(size_t shape[1])              (nn_f32.h:245)
  Tensor2D_F32 *nn_zeros2d_f32(size_t shape[2])              (nn_f32.h:257)
  Tensor3D_F32 *nn_zeros3d_f32(size_t shape[3])              (nn_f32.h:269)
  Tensor4D_F32 *nn_zeros4d_f32(size_t shape[4])              (nn_f32.h:281)
  Tensor0D_F32 *nn_ones0d_f32()                              (nn_f32.h:293)
  Tensor1D_F32 *nn_ones1d_f32(size_t shape[1])               (nn_f32.h:301)
  Tensor2D_F32 *nn_ones2d_f32(size_t shape[2])               (nn_f32.h:313)
  Tensor0D_F32 *nn_full0d_f32(float data)                    (nn_f32.h:325)
  Tensor1D_F32 *nn_full1d_f32(size_t shape[1], float data)   (nn_f32.h:333)
  Tensor2D_F32 *nn_full2d_f32(size_t shape[2], float data)   (nn_f32.h:345)
  Tensor0D_F32 *nn_rand0d_f32()                              (nn_f32.h:357)
  Tensor1D_F32 *nn_rand1d_f32(size_t shape[1])               (nn_f32.h:365)
  Tensor2D_F32 *nn_rand2d_f32(size_t shape[2])               (nn_f32.h:377)

Printing
  void nn_print_tensor1d_f32(const Tensor1D_F32 *tensor)   (nn_f32.h:396)
  void nn_print_tensor2d_f32(const Tensor2D_F32 *tensor)   (nn_f32.h:412)
  void nn_print_tensor3d_f32(const Tensor3D_F32 *tensor)   (nn_f32.h:440)
  void nn_print_tensor4d_f32(const Tensor4D_F32 *tensor)   (nn_f32.h:477)

Comparison
  uint8_t nn_equals0d_f32(const Tensor0D_F32 *a, const Tensor0D_F32 *b, float rel_err)   (nn_f32.h:536)
  uint8_t nn_equals1d_f32(const Tensor1D_F32 *a, const Tensor1D_F32 *b, float rel_err)   (nn_f32.h:548)
  uint8_t nn_equals2d_f32(const Tensor2D_F32 *a, const Tensor2D_F32 *b, float rel_err)   (nn_f32.h:568)
  uint8_t nn_equals3d_f32(const Tensor3D_F32 *a, const Tensor3D_F32 *b, float rel_err)   (nn_f32.h:588)
  uint8_t nn_equals4d_f32(const Tensor4D_F32 *a, const Tensor4D_F32 *b, float rel_err)   (nn_f32.h:608)

Reductions
  void nn_max1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)   (nn_f32.h:624)
  void nn_max2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)   (nn_f32.h:648)
  void nn_min1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)   (nn_f32.h:672)
  void nn_min2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)   (nn_f32.h:696)

Element-wise arithmetic
  void nn_add1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)   (nn_f32.h:734)
  void nn_add2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)   (nn_f32.h:773)
  void nn_addscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)        (nn_f32.h:801)
  void nn_addscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)        (nn_f32.h:825)
  void nn_mul1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)   (nn_f32.h:854)
  void nn_mul2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)   (nn_f32.h:882)
  void nn_mulscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)        (nn_f32.h:910)
  void nn_mulscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)        (nn_f32.h:934)
  void nn_dot_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)     (nn_f32.h:962)

Matrix products
  void nn_mm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)                                    (nn_f32.h:999)
  void nn_addmm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *c, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)          (nn_f32.h:1052)
  void nn_linear_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, const Tensor2D_F32 *weight, const Tensor1D_F32 *bias)   (nn_f32.h:1107)

Activations
  void nn_elu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float alpha)      (nn_f32.h:1180)
  void nn_relu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)                  (nn_f32.h:1194)
  void nn_softmax1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x)               (nn_f32.h:1222)
  void nn_softmax2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, size_t dim)   (nn_f32.h:1236)
  void nn_tanh2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)                  (nn_f32.h:1283)

Attention
  void nn_scaled_dot_product_attention_f32(Tensor4D_F32 *y, const Tensor4D_F32 *query, const Tensor4D_F32 *key, const Tensor4D_F32 *value)   (nn_f32.h:1309)