14  #ifdef CONFIG_BACKEND_RISCV_V
15    #include "riscv_vector.h"
80  static inline uint8_t nn_equal_f32(float golden, float actual, float rel_err) {
81    return (fabs(actual - golden) < rel_err) || (fabs((actual - golden) / actual) < rel_err);
116   tensor->shape[0] = shape[0];
118   size_t n_bytes = shape[0] * sizeof(float);
119   tensor->data = (float *)malloc(n_bytes);
121   memcpy(tensor->data, data, n_bytes);
138   tensor->shape[0] = shape[0];
139   tensor->shape[1] = shape[1];
141   size_t n_bytes = shape[0] * shape[1] * sizeof(float);
142   tensor->data = (float *)malloc(n_bytes);
144   memcpy(tensor->data, data, n_bytes);
161   tensor->shape[0] = shape[0];
162   tensor->shape[1] = shape[1];
163   tensor->shape[2] = shape[2];
165   size_t n_bytes = shape[0] * shape[1] * shape[2] * sizeof(float);
166   tensor->data = (float *)malloc(n_bytes);
168   memcpy(tensor->data, data, n_bytes);
185   tensor->shape[0] = shape[0];
186   tensor->shape[1] = shape[1];
187   tensor->shape[2] = shape[2];
188   tensor->shape[3] = shape[3];
190   size_t n_bytes = shape[0] * shape[1] * shape[2] * shape[3] * sizeof(float);
191   tensor->data = (float *)malloc(n_bytes);
193   memcpy(tensor->data, data, n_bytes);
208   tensor->shape[0] = shape[0];
223   tensor->shape[0] = shape[0];
224   tensor->shape[1] = shape[1];
239   tensor->shape[0] = shape[0];
240   tensor->shape[1] = shape[1];
241   tensor->shape[2] = shape[2];
256   tensor->shape[0] = shape[0];
257   tensor->shape[1] = shape[1];
258   tensor->shape[2] = shape[2];
259   tensor->shape[3] = shape[3];
285   for (size_t i = 0; i < n; i += 1) {
300   size_t n = shape[0] * shape[1];
301   for (size_t i = 0; i < n; i += 1) {
316   size_t n = shape[0] * shape[1] * shape[2];
317   for (size_t i = 0; i < n; i += 1) {
332   size_t n = shape[0] * shape[1] * shape[2] * shape[3];
333   for (size_t i = 0; i < n; i += 1) {
359   for (size_t i = 0; i < n; i += 1) {
374   size_t n = shape[0] * shape[1];
375   for (size_t i = 0; i < n; i += 1) {
404   for (size_t i = 0; i < n; i += 1) {
405     tensor->data[i] = data;
420   size_t n = shape[0] * shape[1];
421   for (size_t i = 0; i < n; i += 1) {
422     tensor->data[i] = data;
447   for (size_t i = 0; i < n; i += 1) {
448     tensor->data[i] = rand();
462   size_t n = shape[0] * shape[1];
463   for (size_t i = 0; i < n; i += 1) {
464     tensor->data[i] = rand();
483   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
485     if (i < tensor->shape[0] - 1) {
501   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
508     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
510       if (j < tensor->shape[1] - 1) {
515     if (i < tensor->shape[0] - 1) {
531   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
538     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
545       for (size_t k = 0; k < tensor->shape[2]; k += 1) {
547         if (k < tensor->shape[2] - 1) {
554     if (i < tensor->shape[0] - 1) {
570   for (size_t i = 0; i < tensor->shape[0]; i += 1) {
577     for (size_t j = 0; j < tensor->shape[1]; j += 1) {
584       for (size_t k = 0; k < tensor->shape[2]; k += 1) {
591         for (size_t l = 0; l < tensor->shape[3]; l += 1) {
593           if (l < tensor->shape[3] - 1) {
598         if (k < tensor->shape[2] - 1) {
603       if (j < tensor->shape[1] - 1) {
608     if (i < tensor->shape[0] - 1) {
646   size_t n = a->shape[0];
647   for (size_t i = 0; i < n; i += 1) {
669   for (size_t i = 0; i < n; i += 1) {
691   for (size_t i = 0; i < n; i += 1) {
713   for (size_t i = 0; i < n; i += 1) {
735   size_t n = x->shape[0];
736   float *x_data = x->data;
738   #ifdef CONFIG_BACKEND_RISCV_V
739     vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
742       size_t vl = __riscv_vsetvl_e32m1(n);
743       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
744       vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
748     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
751     for (size_t i = 0; i < n; i += 1) {
752       float val = x_data[i];
768   float *x_data = x->data;
770   #ifdef CONFIG_BACKEND_RISCV_V
771     vfloat32m1_t vec_max = __riscv_vfmv_s_f_f32m1(-FLT_MAX, 1);
774       size_t vl = __riscv_vsetvl_e32m1(n);
775       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
776       vec_max = __riscv_vfredmax_vs_f32m1_f32m1(vec_x, vec_max, vl);
780     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_max);
783     for (size_t i = 0; i < n; i += 1) {
784       float val = x_data[i];
799   size_t n = x->shape[0];
800   float *x_data = x->data;
802   #ifdef CONFIG_BACKEND_RISCV_V
803     vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
806       size_t vl = __riscv_vsetvl_e32m1(n);
807       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
808       vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
812     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
815     for (size_t i = 0; i < n; i += 1) {
816       float val = x_data[i];
832   float *x_data = x->data;
834   #ifdef CONFIG_BACKEND_RISCV_V
835     vfloat32m1_t vec_min = __riscv_vfmv_s_f_f32m1(FLT_MAX, 1);
838       size_t vl = __riscv_vsetvl_e32m1(n);
839       vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
840       vec_min = __riscv_vfredmin_vs_f32m1_f32m1(vec_x, vec_min, vl);
844     y->data = __riscv_vfmv_f_s_f32m1_f32(vec_min);
847     for (size_t i = 0; i < n; i += 1) {
848       float val = x_data[i];
872   size_t n = y->shape[0];
873   float *x1_data = x1->data;
874   float *x2_data = x2->data;
875   float *y_data = y->data;
877   #ifdef CONFIG_BACKEND_RISCV_V
879     size_t vl = __riscv_vsetvl_e32m1(n);
880     vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
881     vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
882     vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
883     __riscv_vse32_v_f32m1(y_data, vec_y, vl);
890     for (size_t i = 0; i < n; i += 1) {
891       y_data[i] = x1_data[i] + x2_data[i];
912   float *x1_data = x1->data;
913   float *x2_data = x2->data;
914   float *y_data = y->data;
916   #ifdef CONFIG_BACKEND_RISCV_V
918     size_t vl = __riscv_vsetvl_e32m1(n);
919     vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
920     vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
921     vfloat32m1_t vec_y = __riscv_vfadd_vv_f32m1(vec_x1, vec_x2, vl);
922     __riscv_vse32_v_f32m1(y_data, vec_y, vl);
929     for (size_t i = 0; i < n; i += 1) {
930       y_data[i] = x1_data[i] + x2_data[i];
949   size_t n = y->shape[0];
950   float *x_data = x->data;
951   float *y_data = y->data;
953   #ifdef CONFIG_BACKEND_RISCV_V
955     size_t vl = __riscv_vsetvl_e32m1(n);
956     vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
957     vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
958     __riscv_vse32_v_f32m1(y_data, vec_y, vl);
964     for (size_t i = 0; i < n; i += 1) {
965       y_data[i] = x_data[i] + scalar;
985   float *x_data = x->data;
986   float *y_data = y->data;
988   #ifdef CONFIG_BACKEND_RISCV_V
990     size_t vl = __riscv_vsetvl_e32m1(n);
991     vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
992     vfloat32m1_t vec_y = __riscv_vfadd_vf_f32m1(vec_x, scalar, vl);
993     __riscv_vse32_v_f32m1(y_data, vec_y, vl);
999     for (size_t i = 0; i < n; i += 1) {
1000      y_data[i] = x_data[i] + scalar;
1024  size_t n = y->shape[0];
1025  float *x1_data = x1->data;
1026  float *x2_data = x2->data;
1027  float *y_data = y->data;
1029  #ifdef CONFIG_BACKEND_RISCV_V
1031    size_t vl = __riscv_vsetvl_e32m1(n);
1032    vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1033    vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1034    vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
1035    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1042    for (size_t i = 0; i < n; i += 1) {
1043      y_data[i] = x1_data[i] * x2_data[i];
1064  float *x1_data = x1->data;
1065  float *x2_data = x2->data;
1066  float *y_data = y->data;
1068  #ifdef CONFIG_BACKEND_RISCV_V
1070    size_t vl = __riscv_vsetvl_e32m1(n);
1071    vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1072    vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1073    vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
1074    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1081    for (size_t i = 0; i < n; i += 1) {
1082      y_data[i] = x1_data[i] * x2_data[i];
1101  size_t n = y->shape[0];
1102  float *x_data = x->data;
1103  float *y_data = y->data;
1105  #ifdef CONFIG_BACKEND_RISCV_V
1107    size_t vl = __riscv_vsetvl_e32m1(n);
1108    vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1109    vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
1110    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1116    for (size_t i = 0; i < n; i += 1) {
1117      y_data[i] = x_data[i] * scalar;
1137  float *x_data = x->data;
1138  float *y_data = y->data;
1140  #ifdef CONFIG_BACKEND_RISCV_V
1142    size_t vl = __riscv_vsetvl_e32m1(n);
1143    vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1144    vfloat32m1_t vec_y = __riscv_vfmul_vf_f32m1(vec_x, scalar, vl);
1145    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1151    for (size_t i = 0; i < n; i += 1) {
1152      y_data[i] = x_data[i] * scalar;
1171  nn_assert(out->shape[0] == in->shape[0], "Cannot convert between tensors of different shapes");
1172  nn_assert(out->shape[1] == in->shape[2], "Cannot convert between tensors of different shapes");
1173  nn_assert(out->shape[2] == in->shape[3], "Cannot convert between tensors of different shapes");
1174  nn_assert(out->shape[3] == in->shape[1], "Cannot convert between tensors of different shapes");
1176  size_t batch_size = in->shape[0];
1177  size_t height = in->shape[2];
1178  size_t width = in->shape[3];
1179  size_t channels = in->shape[1];
1181  for (size_t n = 0; n < batch_size; n += 1) {
1182    for (size_t c = 0; c < channels; c += 1) {
1183      for (size_t h = 0; h < height; h += 1) {
1184        for (size_t w = 0; w < width; w += 1) {
1185          size_t nchw_index = n * channels * height * width + c * height * width + h * width + w;
1186          size_t nhwc_index = n * height * width * channels + h * width * channels + w * channels + c;
1187          ((float *)out->data)[nhwc_index] = ((float *)in->data)[nchw_index];
1204  nn_assert(out->shape[0] == in->shape[0], "Cannot convert between tensors of different shapes");
1205  nn_assert(out->shape[1] == in->shape[3], "Cannot convert between tensors of different shapes");
1206  nn_assert(out->shape[2] == in->shape[1], "Cannot convert between tensors of different shapes");
1207  nn_assert(out->shape[3] == in->shape[2], "Cannot convert between tensors of different shapes");
1209  size_t batch_size = in->shape[0];
1210  size_t height = in->shape[1];
1211  size_t width = in->shape[2];
1212  size_t channels = in->shape[3];
1214  for (size_t n = 0; n < batch_size; n += 1) {
1215    for (size_t c = 0; c < channels; c += 1) {
1216      for (size_t h = 0; h < height; h += 1) {
1217        for (size_t w = 0; w < width; w += 1) {
1218          size_t nhwc_index = n * height * width * channels + h * width * channels + w * channels + c;
1219          size_t nchw_index = n * channels * height * width + c * height * width + h * width + w;
1220          ((float *)out->data)[nchw_index] = ((float *)in->data)[nhwc_index];
1239      const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups) {
1241  size_t batch_size = in->shape[0];
1242  size_t in_height = in->shape[1];
1243  size_t in_width = in->shape[2];
1244  size_t in_channels = in->shape[3];
1246  size_t out_height = out->shape[1];
1247  size_t out_width = out->shape[2];
1248  size_t out_channels = out->shape[3];
1250  size_t kernel_height = weight->shape[0];
1251  size_t kernel_width = weight->shape[1];
1252  size_t stride_height = stride[0];
1253  size_t stride_width = stride[1];
1254  size_t padding_height = padding[0];
1255  size_t padding_width = padding[1];
1256  size_t dilation_height = dilation[0];
1257  size_t dilation_width = dilation[1];
1259  nn_assert(out->shape[0] == batch_size, "conv2d: output batch size must match input");
1260  nn_assert(weight->shape[3] == out_channels, "conv2d: weight output channels must match output tensor");
1261  nn_assert(weight->shape[2] * groups == in_channels, "conv2d: weight input channels times groups must match input channels");
1262  nn_assert(out_height == (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height + 1, "conv2d: output height does not match convolution parameters");
1263  nn_assert(out_width == (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1, "conv2d: output width does not match convolution parameters");
1264  nn_assert(groups > 0, "conv2d: groups must be positive");
1265  nn_assert(in_channels % groups == 0, "conv2d: input channels must be divisible by groups");
1266  nn_assert(out_channels % groups == 0, "conv2d: output channels must be divisible by groups");
1270  memset(out->data, 0, batch_size * out_height * out_width * out_channels * sizeof(float));
1272  #ifdef CONFIG_BACKEND_GEMMINI
1279        batch_size, in_height, in_width, in_channels,
1280        out_channels, out_height, out_width,
1281        stride_height, dilation_height, 1, padding_height, kernel_height,
1287        NO_ACTIVATION, ACC_SCALE_IDENTITY,
1291    else if (groups == in_channels) {
1292      assert(weight->shape[2] == 1);
1294      Tensor *in_nchw = NN_tensor(4, (size_t[]){batch_size, in_channels, in_height, in_width}, DTYPE_F32, NULL);
1295      Tensor *out_nchw = NN_tensor(4, (size_t[]){batch_size, out_channels, out_height, out_width}, DTYPE_F32, NULL);
1297      Tensor *weight_1hwc = NN_tensor(4, (size_t[]){1, kernel_height, kernel_width, out_channels}, DTYPE_F32, weight->data);
1298      Tensor *weight_1chw = NN_tensor(4, (size_t[]){1, out_channels, kernel_height, kernel_width}, DTYPE_F32, NULL);
1300      NN_nhwc_to_nchw(in_nchw, in);
1301      NN_nhwc_to_nchw(weight_1chw, weight_1hwc);
1303      for (size_t g = 0; g < groups; g += 1) {
1305          batch_size, in_height, in_width, 1,
1306          1, out_height, out_width,
1307          stride_height, dilation_height, 1, padding_height, kernel_height,
1309          ((float *)in_nchw->data) + g * in_height * in_width,
1310          ((float *)weight_1chw->data) + g * kernel_height * kernel_width,
1311          ((float *)bias->data) + g,
1312          ((float *)out_nchw->data) + g * out_height * out_width,
1313          NO_ACTIVATION, ACC_SCALE_IDENTITY,
1318      NN_nchw_to_nhwc(out, out_nchw);
1322      printf("[ERROR] Unsupported conv2d operation for groups other than 1 or in_channels\n");
1330    for (size_t b = 0; b < batch_size; b += 1) {
1331      for (size_t oc = 0; oc < out_channels; oc += 1) {
1332        for (size_t oh = 0; oh < out_height; oh += 1) {
1333          for (size_t ow = 0; ow < out_width; ow += 1) {
1335            for (size_t kh = 0; kh < kernel_height; kh += 1) {
1336              for (size_t kw = 0; kw < kernel_width; kw += 1) {
1337                for (size_t ic = 0; ic < in_channels; ic += 1) {
1338                  size_t ih = oh * stride_height + kh * dilation_height - padding_height;
1339                  size_t iw = ow * stride_width + kw * dilation_width - padding_width;
1340                  if (ih < in_height && iw < in_width) {
1341                    size_t in_idx = b * in_height * in_width * in_channels
1342                        + ih * in_width * in_channels
1345                    size_t weight_idx = kh * kernel_width * in_channels * out_channels
1346                        + kw * in_channels * out_channels
1349                    value += ((float *)in->data)[in_idx] * ((float *)weight->data)[weight_idx];
1355            value += ((float *)bias->data)[oc];
1357            size_t out_idx = b * out_height * out_width * out_channels
1358                + oh * out_width * out_channels
1361            ((float *)out->data)[out_idx] = value;
1367    else if (groups == in_channels) {
1369      for (size_t b = 0; b < batch_size; b += 1) {
1370        for (size_t oc = 0; oc < out_channels; oc += 1) {
1371          for (size_t oh = 0; oh < out_height; oh += 1) {
1372            for (size_t ow = 0; ow < out_width; ow += 1) {
1374              for (size_t kh = 0; kh < kernel_height; kh += 1) {
1375                for (size_t kw = 0; kw < kernel_width; kw += 1) {
1376                  size_t ih = oh * stride_height + kh * dilation_height - padding_height;
1377                  size_t iw = ow * stride_width + kw * dilation_width - padding_width;
1378                  if (ih < in_height && iw < in_width) {
1379                    size_t in_idx = b * in_height * in_width * in_channels
1380                        + ih * in_width * in_channels
1383                    size_t weight_idx = kh * kernel_width * in_channels
1386                    value += ((float *)in->data)[in_idx] * ((float *)weight->data)[weight_idx];
1391              value += ((float *)bias->data)[oc];
1393              size_t out_idx = b * out_height * out_width * out_channels
1394                  + oh * out_width * out_channels
1397              ((float *)out->data)[out_idx] = value;
1404      printf("[ERROR] Unsupported conv2d operation for groups other than 1 or in_channels\n");
1429  size_t n = y->shape[0];
1430  float *x1_data = x1->data;
1431  float *x2_data = x2->data;
1432  float *y_data = y->data;
1434  #ifdef CONFIG_BACKEND_RISCV_V
1436    size_t vl = __riscv_vsetvl_e32m1(n);
1437    vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_data, vl);
1438    vfloat32m1_t vec_x2 = __riscv_vle32_v_f32m1(x2_data, vl);
1439    vfloat32m1_t vec_y = __riscv_vfmul_vv_f32m1(vec_x1, vec_x2, vl);
1440    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1448    for (size_t i = 0; i < n; i += 1) {
1449      sum += x1_data[i] * x2_data[i];
1467  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MV on tensors of different shapes");
1468  nn_assert(y->shape[0] == x1->shape[0], "Cannot perform MV on tensors of different shapes");
1470  const size_t n = x1->shape[0];
1471  const size_t m = x1->shape[1];
1472  float *x1_data = x1->data;
1473  float *x2_data = x2->data;
1474  float *y_data = y->data;
1476  for (size_t i = 0; i < y->shape[0]; i += 1) {
1478    for (size_t j = 0; j < m; j += 1) {
1479      sum += x1_data[i * m + j] * x2_data[j];
1498  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMul on tensors of different shapes");
1501  const size_t n = x1->shape[0];
1502  const size_t m = x1->shape[1];
1503  const size_t p = x2->shape[1];
1505  for (size_t i = 0; i < n; i += 1) {
1506    float *x1_row = x1->data + i * m;
1507    float *y_row = y->data + i * p;
1509    #ifdef CONFIG_BACKEND_RISCV_V
1511      size_t vlmax = __riscv_vsetvlmax_e32m1();
1512      for (size_t j = 0; j < p; j += 1) {
1513        vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1514        vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1516        float *x2_col = x2->data + j;
1520          size_t vl = __riscv_vsetvl_e32m1(k);
1521          vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_row, vl);
1522          vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_col, p * sizeof(float), vl);
1523          vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1530        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1531          vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1533          vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1535        y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1538      for (size_t j = 0; j < p; j += 1) {
1539        float *x2_row = x2->data + j;
1542        for (size_t k = 0; k < m; k += 1) {
1543          sum += x1_row[k] * x2_row[k * p];
1564  nn_assert(x1->shape[1] == x2->shape[0], "Cannot perform MatMulAdd on tensors of different shapes");
1567  const size_t n = x1->shape[0];
1568  const size_t m = x1->shape[1];
1569  const size_t p = x2->shape[1];
1571  for (size_t i = 0; i < n; i += 1) {
1572    float *x1_row = x1->data + i * m;
1573    float *c_row = c->data + i * p;
1574    float *y_row = y->data + i * p;
1576    #ifdef CONFIG_BACKEND_RISCV_V
1578      size_t vlmax = __riscv_vsetvlmax_e32m1();
1579      for (size_t j = 0; j < p; j += 1) {
1580        vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1581        vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1583        float *x2_col = x2->data + j;
1587          size_t vl = __riscv_vsetvl_e32m1(k);
1588          vfloat32m1_t vec_x1 = __riscv_vle32_v_f32m1(x1_row, vl);
1589          vfloat32m1_t vec_x2 = __riscv_vlse32_v_f32m1(x2_col, p * sizeof(float), vl);
1590          vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x1, vec_x2, vl);
1597        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1598          vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1600          vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1602        y_row[j] = __riscv_vfmv_f_s_f32m1_f32(vec_sum) + c_row[j];
1606      for (size_t j = 0; j < p; j += 1) {
1607        float *x2_col = x2->data + j;
1610        for (size_t k = 0; k < m; k += 1) {
1611          sum += x1_row[k] * x2_col[k * p];
1613        y_row[j] = sum + c_row[j];
1634  nn_assert(x->shape[1] == weight->shape[1], "Cannot perform Linear on tensors of different shapes");
1635  nn_assert(!bias || bias->shape[0] == weight->shape[0], "Cannot perform Linear on tensors of different shapes");
1638  const size_t batch_size = x->shape[0];
1639  const size_t in_features = x->shape[1];
1640  const size_t out_features = weight->shape[0];
1642  float *x_batch_data = x->data;
1643  float *y_batch_data = y->data;
1645  for (size_t i = 0; i < batch_size; i += 1) {
1646    float *x_data = x_batch_data;
1647    float *y_data = y_batch_data;
1649    #ifdef CONFIG_BACKEND_RISCV_V
1650      size_t vlmax = __riscv_vsetvlmax_e32m1();
1652      for (size_t j = 0; j < out_features; j += 1) {
1653        vfloat32m1_t vec_zero = __riscv_vfmv_v_f_f32m1(0, vlmax);
1654        vfloat32m1_t vec_sum = __riscv_vfmv_v_f_f32m1(0, vlmax);
1656        float *weight_row = weight->data + j * in_features;
1657        size_t n = in_features;
1660          size_t vl = __riscv_vsetvl_e32m1(n);
1661          vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1662          vfloat32m1_t vec_w = __riscv_vle32_v_f32m1(weight_row, vl);
1663          vec_sum = __riscv_vfmacc_vv_f32m1(vec_sum, vec_x, vec_w, vl);
1670        #ifdef CONFIG_DEBUG_RISCV_V_USE_REDOSUM
1671          vec_sum = __riscv_vfredosum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1673          vec_sum = __riscv_vfredusum_vs_f32m1_f32m1(vec_sum, vec_zero, vlmax);
1676        float sum = __riscv_vfmv_f_s_f32m1_f32(vec_sum);
1678          sum += bias->data[j];
1681        x_data = x_batch_data;
1684      for (size_t j = 0; j < out_features; j += 1) {
1685        float *weight_row = weight->data + j * in_features;
1688        for (size_t k = 0; k < in_features; k += 1) {
1689          sum += x_data[k] * weight_row[k];
1692          sum += bias->data[j];
1698    x_batch_data += in_features;
1699    y_batch_data += out_features;
1723  float *x_data = x->data;
1724  float *y_data = y->data;
1726  for (size_t i = 0; i < n; i += 1) {
1727    if (x_data[i] > 0) {
1728      y_data[i] = x_data[i];
1731      y_data[i] = alpha * (expf(x_data[i]) - 1.f);
1750  float *x_data = x->data;
1751  float *y_data = y->data;
1753  #ifdef CONFIG_BACKEND_RISCV_V
1757    size_t vl = __riscv_vsetvl_e32m1(n);
1758    vfloat32m1_t vec_x = __riscv_vle32_v_f32m1(x_data, vl);
1759    vfloat32m1_t vec_y = __riscv_vfmax_vf_f32m1(vec_x, zero, vl);
1760    __riscv_vse32_v_f32m1(y_data, vec_y, vl);
1766    for (size_t i = 0; i < n; i += 1) {
1767      float x_val = x_data[i];
1768      y_data[i] = x_val > 0 ? x_val : 0;
1784  nn_assert(x->shape[0] == y->shape[0], "Cannot perform SiLU on tensors of different shapes");
1786  const size_t n = y->shape[0];
1787  float *x_data = x->data;
1788  float *y_data = y->data;
1790  for (size_t i = 0; i < n; i++) {
1791    float x_i = x_data[i];
1792    float sigmoid_x = 1.0f / (1.0f + expf(-x_i));
1793    y_data[i] = x_i * sigmoid_x;
1810  const size_t n = y->shape[0];
1811  float *x_data = x->data;
1812  float *y_data = y->data;
1815  for (size_t i = 0; i < n; i += 1) {
1816    y_data[i] = expf(x_data[i]);
1820  for (size_t i = 0; i < n; i += 1) {
1839  float *y_data = y->data;
1840  float *x_data = x->data;
1843  for (size_t i = 0; i < y->shape[1]; i += 1) {
1844    size_t n = y->shape[0];
1845    size_t m = y->shape[1];
1847    for (size_t j = 0; j < n; j += 1) {
1848      sum += expf(x_data[j * m]);
1851    for (size_t j = 0; j < n; j += 1) {
1852      y_data[j * m] = expf(x_data[j * m]) / sum;
1859  else if (dim == 1) {
1861    for (size_t i = 0; i < y->shape[0]; i += 1) {
1862      size_t n = y->shape[1];
1864      for (size_t j = 0; j < n; j += 1) {
1865        sum += expf(x_data[j]);
1868      for (size_t j = 0; j < n; j += 1) {
1869        y_data[j] = expf(x_data[j]) / sum;
1877    nn_assert(0, "Invalid dimension for softmax");
1892  float *x_data = x->data;
1893  float *y_data = y->data;
1895  for (size_t i = 0; i < n; i += 1) {
1896    float x_val = x_data[i];
1897    y_data[i] = tanhf(x_val);
1902  nn_assert(x->shape[0] == y->shape[0], "Cannot perform RMSNorm on tensors of different shapes");
1904  const size_t n = y->shape[0];
1905  float *x_data = x->data;
1906  float *y_data = y->data;
1907  float *w_data = weight->data;
1910  for (size_t i = 0; i < n; i += 1) {
1911    ss += x_data[i] * x_data[i];
1949  nn_assert(query->shape[1] == key->shape[1] && query->shape[1] == value->shape[1], "Query, key, and value must have the same number of heads");
1950  nn_assert(key->shape[2] == value->shape[2], "Key and value must have the same sequence length");
1951  nn_assert(query->shape[3] == key->shape[3], "Query and key must have the same embedding dimension");
1953  size_t n = query->shape[0];
1954  size_t h = query->shape[1];
1955  size_t l = query->shape[2];
1956  size_t s = key->shape[2];
1957  size_t e = query->shape[3];
1958  size_t ev = value->shape[3];
1961  float scale_factor = 1.0f / sqrt(e);
1964  for (size_t batch = 0; batch < n; batch += 1) {
1966    for (size_t head = 0; head < h; head += 1) {
1968      size_t query_head_dims[2] = {l, e};
1969      size_t key_head_dims[2] = {s, e};
1970      size_t key_transposed_dims[2] = {e, s};
1971      size_t attn_weight_head_dims[2] = {l, s};
1972      size_t value_head_dims[2] = {s, ev};
1973      size_t y_head_dims[2] = {l, ev};
1976      float *query_data = (float *)query->data + (batch * h * l * e) + (head * l * e);
1977      float *key_data = (float *)key->data + (batch * h * s * e) + (head * s * e);
1978      float *value_data = (float *)value->data + (batch * h * s * ev) + (head * s * ev);
1979      float *y_data = (float *)y->data + (batch * h * l * ev) + (head * l * ev);
1989      for (size_t i = 0; i < s; i += 1) {
1990        for (size_t j = 0; j < e; j += 1) {
1991          key_transposed->data[j * s + i] = key_head->data[i * e + j];
1997      nn_mm_f32(attn_weight_head, query_head, key_transposed);
2007      nn_mm_f32(y_head, attn_weight_head, value_head);
2012      free(key_transposed->data);
2013      free(key_transposed);
2014      free(attn_weight_head->data);
2015      free(attn_weight_head);
void nn_print_f32(float v, int16_t num_digits)
Definition: nn.h:96
static void nn_assert(int condition, char *message)
Definition: nn.h:59
void nn_linear_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, const Tensor2D_F32 *weight, const Tensor1D_F32 *bias)
Linear neural network layer.
Definition: nn_f32.h:1633
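A minimal usage sketch for nn_linear_f32, assuming the library header that declares these functions is included; the shapes follow the checks in the listing (x: [batch, in_features], weight: [out_features, in_features], bias: [out_features], y: [batch, out_features]), and all values are illustrative.

void linear_example(void) {
  float x_data[2 * 4] = {0};                           /* 2 x 4 input, values illustrative */
  float w_data[3 * 4] = {0};                           /* out_features x in_features weight */
  float b_data[3]     = {0};
  Tensor2D_F32 *x = nn_tensor2d_f32((size_t[]){2, 4}, x_data);
  Tensor2D_F32 *w = nn_tensor2d_f32((size_t[]){3, 4}, w_data);
  Tensor1D_F32 *b = nn_tensor1d_f32((size_t[]){3}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 3});  /* batch x out_features */
  nn_linear_f32(y, x, w, b);                           /* y = x * w^T + b, row by row */
  nn_print_tensor2d_f32(y);
}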
Tensor3D_F32 * nn_zeros3d_f32(size_t shape[3])
Returns a 3D floating-point data tensor filled with the scalar value 0, with the shape defined by the 3-element array shape.
Definition: nn_f32.h:314
void nn_softmax1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x)
Applies the softmax activation function to a 1D floating-point tensor.
Definition: nn_f32.h:1807
Tensor1D_F32 * nn_zeros1d_f32(size_t shape[1])
Returns a 1D floating-point tensor filled with the scalar value 0, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:282
void nn_print_tensor4d_f32(const Tensor4D_F32 *tensor)
Prints the content of a 4D floating-point data tensor.
Definition: nn_f32.h:568
uint8_t nn_equals0d_f32(const Tensor0D_F32 *a, const Tensor0D_F32 *b, float rel_err)
Checks if two 0D floating-point tensors are equal.
Definition: nn_f32.h:629
Tensor0D_F32 * nn_full0d_f32(float data)
Returns a 0D floating-point data tensor (scalar) filled with the scalar value data.
Definition: nn_f32.h:388
void nn_mm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Performs a matrix multiplication of the matrices x1 and x2.
Definition: nn_f32.h:1497
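A small sketch of nn_mm_f32, assuming the header is included; per the assert in the listing, x1 is [n, m], x2 is [m, p], and y must be [n, p].

void mm_example(void) {
  float a_data[2 * 3] = {1, 2, 3, 4, 5, 6};
  float b_data[3 * 2] = {1, 0, 0, 1, 1, 1};
  Tensor2D_F32 *a = nn_tensor2d_f32((size_t[]){2, 3}, a_data);
  Tensor2D_F32 *b = nn_tensor2d_f32((size_t[]){3, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});
  nn_mm_f32(y, a, b);                                  /* y = a x b, a 2x2 result */
  nn_print_tensor2d_f32(y);
}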
Tensor2D_F32 * nn_rand2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with random floating-point numbers, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:460
void nn_mulscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)
Multiplies a scalar with a 2D floating-point tensor and stores the result in y.
Definition: nn_f32.h:1133
void nn_mv_f32(Tensor1D_F32 *y, const Tensor2D_F32 *x1, const Tensor1D_F32 *x2)
Performs a matrix-vector multiplication of the matrix x1 and the vector x2.
Definition: nn_f32.h:1466
void nn_rms_norm1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, const Tensor1D_F32 *weight, float eps)
Definition: nn_f32.h:1901
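The listing only shows the sum-of-squares accumulation for this function, so the sketch below assumes the standard RMSNorm formula, y[i] = w[i] * x[i] / sqrt(mean(x^2) + eps), with x, weight, and y sharing the same length; the eps value is illustrative.

void rms_norm_example(void) {
  float x_data[4] = {1.0f, -2.0f, 3.0f, -4.0f};
  float w_data[4] = {1.0f, 1.0f, 1.0f, 1.0f};
  Tensor1D_F32 *x = nn_tensor1d_f32((size_t[]){4}, x_data);
  Tensor1D_F32 *w = nn_tensor1d_f32((size_t[]){4}, w_data);
  Tensor1D_F32 *y = nn_zeros1d_f32((size_t[]){4});
  nn_rms_norm1d_f32(y, x, w, 1e-5f);                   /* eps chosen for illustration only */
}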
void nn_addscalar2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float scalar)
Adds a scalar to a 2D floating-point tensor and stores the result in y.
Definition: nn_f32.h:981
Tensor2D_F32 * nn_full2d_f32(size_t shape[2], float data)
Returns a 2D floating-point data tensor filled with the scalar value data, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:418
uint8_t nn_equals3d_f32(const Tensor3D_F32 *a, const Tensor3D_F32 *b, float rel_err)
Checks if two 3D floating-point tensors are equal.
Definition: nn_f32.h:687
void nn_max2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)
Finds the maximum value in a 2D floating-point tensor.
Definition: nn_f32.h:766
Tensor3D_F32 * nn_tensor3d_f32(size_t shape[3], const float *data)
Creates a 3D floating-point data tensor, with the shape defined by the 3-element array shape.
Definition: nn_f32.h:159
void nn_relu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)
Applies the ReLU activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1746
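A short sketch of nn_relu2d_f32 on explicit data (values illustrative), assuming the header is included; negative entries are clamped to zero, matching the scalar fallback in the listing.

void relu_example(void) {
  float x_data[2 * 3] = {-1.0f, 0.5f, -2.0f, 3.0f, -0.25f, 4.0f};
  Tensor2D_F32 *x = nn_tensor2d_f32((size_t[]){2, 3}, x_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 3});
  nn_relu2d_f32(y, x);                                 /* y = {0, 0.5, 0, 3, 0, 4} */
}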
Tensor2D_F32 * nn_tensor2d_f32(size_t shape[2], const float *data)
Creates a 2D floating-point data tensor, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:136
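Per the malloc/memcpy at source lines 142-144, nn_tensor2d_f32 allocates its own buffer and copies data, so the caller's array can change afterwards without affecting the tensor. A sketch, assuming the header is included:

void tensor2d_example(void) {
  float values[2 * 2] = {1.0f, 2.0f, 3.0f, 4.0f};
  Tensor2D_F32 *t = nn_tensor2d_f32((size_t[]){2, 2}, values);
  values[0] = 99.0f;                                   /* does not affect t: data was copied */
  nn_print_tensor2d_f32(t);                            /* still prints 1 2 / 3 4 */
}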
void nn_add2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:907
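An element-wise addition sketch for nn_add2d_f32, assuming the header is included and matching shapes as the listing requires.

void add2d_example(void) {
  float a_data[4] = {1, 2, 3, 4};
  float b_data[4] = {10, 20, 30, 40};
  Tensor2D_F32 *a = nn_tensor2d_f32((size_t[]){2, 2}, a_data);
  Tensor2D_F32 *b = nn_tensor2d_f32((size_t[]){2, 2}, b_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 2});
  nn_add2d_f32(y, a, b);                               /* y = {11, 22, 33, 44} */
}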
void nn_scaled_dot_product_attention_f32(Tensor4D_F32 *y, const Tensor4D_F32 *query, const Tensor4D_F32 *key, const Tensor4D_F32 *value)
Computes scaled dot product attention on query, key and value tensors.
Definition: nn_f32.h:1947
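A shape-only sketch of nn_scaled_dot_product_attention_f32, assuming the header is included. The layouts follow the asserts in the listing: query [n, h, l, e], key [n, h, s, e], value [n, h, s, ev], y [n, h, l, ev]. The 1/sqrt(e) scaling and the two per-head matrix multiplications are visible in the listing; the softmax over attention weights is assumed from the standard definition. Zero-filled tensors are used only to show the shapes.

void attention_example(void) {
  size_t n = 1, h = 2, l = 4, s = 4, e = 8, ev = 8;
  Tensor4D_F32 *q = nn_zeros4d_f32((size_t[]){n, h, l, e});
  Tensor4D_F32 *k = nn_zeros4d_f32((size_t[]){n, h, s, e});
  Tensor4D_F32 *v = nn_zeros4d_f32((size_t[]){n, h, s, ev});
  Tensor4D_F32 *y = nn_zeros4d_f32((size_t[]){n, h, l, ev});
  nn_scaled_dot_product_attention_f32(y, q, k, v);
}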
void nn_dot_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Computes the dot product of two 1D floating-point tensors.
Definition: nn_f32.h:1425
void nn_addmm_f32(Tensor2D_F32 *y, const Tensor2D_F32 *c, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Performs a matrix multiplication of two 2D floating-point tensors and adds the result to a third tensor.
Definition: nn_f32.h:1563
void nn_conv2d_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in, const Tensor4D_F32 *weight, const Tensor1D_F32 *bias, const size_t *stride, const size_t *padding, const size_t *dilation, size_t groups)
Performs a 2D convolution on a 4D tensor.
Definition: nn_f32.h:1236
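A sketch of an nn_conv2d_f32 call with groups = 1, assuming the header is included. Per the listing the tensors are NHWC: in [batch, in_h, in_w, in_c], weight [kh, kw, in_c / groups, out_c], bias [out_c], out [batch, out_h, out_w, out_c], and the output size must satisfy out_h = (in_h + 2*pad - dil*(kh-1) - 1)/stride + 1; a 3x3 kernel with stride 1, padding 1, dilation 1 keeps the 8x8 spatial size. Zero-filled tensors are used only to show the shapes.

void conv2d_example(void) {
  Tensor4D_F32 *in     = nn_zeros4d_f32((size_t[]){1, 8, 8, 3});
  Tensor4D_F32 *weight = nn_zeros4d_f32((size_t[]){3, 3, 3, 4});
  Tensor1D_F32 *bias   = nn_zeros1d_f32((size_t[]){4});
  Tensor4D_F32 *out    = nn_zeros4d_f32((size_t[]){1, 8, 8, 4});
  size_t stride[2]   = {1, 1};
  size_t padding[2]  = {1, 1};
  size_t dilation[2] = {1, 1};
  nn_conv2d_f32(out, in, weight, bias, stride, padding, dilation, 1);
}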
Tensor1D_F32 * nn_ones1d_f32(size_t shape[1])
Returns a 1D floating-point data tensor filled with the scalar value 1, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:356
void nn_min2d_f32(Tensor0D_F32 *y, const Tensor2D_F32 *x)
Finds the minimum value in a 2D floating-point tensor.
Definition: nn_f32.h:830
Tensor0D_F32 * nn_ones0d_f32()
Returns a 0D floating-point data tensor (scalar) filled with the scalar value 1.
Definition: nn_f32.h:344
static uint8_t nn_equal_f32(float golden, float actual, float rel_err)
Checks if two floating-point numbers are equal within a relative error.
Definition: nn_f32.h:80
Tensor4D_F32 * nn_tensor4d_f32(size_t shape[4], const float *data)
Creates a 4D floating-point data tensor, with the shape defined by the 4-element array shape.
Definition: nn_f32.h:183
void nn_elu2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, float alpha)
Applies the ELU activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1719
void nn_print_tensor3d_f32(const Tensor3D_F32 *tensor)
Prints the content of a 3D floating-point data tensor.
Definition: nn_f32.h:529
void nn_addscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)
Adds a scalar to a 1D floating-point tensor and stores the result in y.
Definition: nn_f32.h:946
void nn_tanh2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x)
Applies the tanh activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1888
Tensor2D_F32 * nn_as_tensor2d_f32(size_t shape[2], float *data)
Converts data into a tensor, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:221
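The listing only shows the shape assignment for this constructor, so the no-copy behavior below is an assumption based on its name and its non-const float * parameter: the tensor is taken to be a view over caller-owned memory.

void as_tensor2d_example(void) {
  static float buffer[4 * 4];                          /* memory owned by the caller */
  Tensor2D_F32 *view = nn_as_tensor2d_f32((size_t[]){4, 4}, buffer);
  buffer[0] = 42.0f;                                   /* visible through view if no copy is made */
  nn_print_tensor2d_f32(view);
}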
Tensor1D_F32 * nn_rand1d_f32(size_t shape[1])
Returns a 1D floating-point data tensor filled with random floating-point numbers, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:444
void nn_mul1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:1020
void nn_softmax2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x, size_t dim)
Applies the softmax activation function to a 2D floating-point tensor.
Definition: nn_f32.h:1836
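A sketch of nn_softmax2d_f32, assuming the header is included. Following the two branches in the listing, dim selects the dimension that is normalized: dim == 0 makes each column sum to 1, dim == 1 makes each row sum to 1.

void softmax2d_example(void) {
  float x_data[2 * 3] = {1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f};
  Tensor2D_F32 *x = nn_tensor2d_f32((size_t[]){2, 3}, x_data);
  Tensor2D_F32 *y = nn_zeros2d_f32((size_t[]){2, 3});
  nn_softmax2d_f32(y, x, 1);                           /* each of the 2 rows now sums to 1 */
}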
uint8_t nn_equals4d_f32(const Tensor4D_F32 *a, const Tensor4D_F32 *b, float rel_err)
Checks if two 4D floating-point tensors are equal.
Definition: nn_f32.h:709
void nn_mulscalar1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x, float scalar)
Multiplies a scalar with a 1D floating-point tensor and stores the result in y.
Definition: nn_f32.h:1098
void nn_add1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x1, const Tensor1D_F32 *x2)
Adds x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:868
Tensor2D_F32 * nn_zeros2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with the scalar value 0, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:298
uint8_t nn_equals1d_f32(const Tensor1D_F32 *a, const Tensor1D_F32 *b, float rel_err)
Checks if two 1D floating-point tensors are equal.
Definition: nn_f32.h:643
void nn_nchw_to_nhwc_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in)
Converts a 4D tensor from NCHW format to NHWC format.
Definition: nn_f32.h:1170
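A layout-conversion sketch, assuming the header is included; per the asserts in the listing, the source is NCHW [N, C, H, W], the destination must be NHWC [N, H, W, C], and the destination is passed first.

void layout_example(void) {
  Tensor4D_F32 *nchw = nn_zeros4d_f32((size_t[]){1, 3, 4, 5});   /* N=1, C=3, H=4, W=5 */
  Tensor4D_F32 *nhwc = nn_zeros4d_f32((size_t[]){1, 4, 5, 3});
  nn_nchw_to_nhwc_f32(nhwc, nchw);
}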
void nn_print_tensor2d_f32(const Tensor2D_F32 *tensor)
Prints the content of a 2D floating-point data tensor.
Definition: nn_f32.h:499
void nn_silu1d_f32(Tensor1D_F32 *y, const Tensor1D_F32 *x)
Applies the SiLU (Sigmoid Linear Unit) activation function to a 1D floating-point tensor.
Definition: nn_f32.h:1783
void nn_min1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)
Finds the minimum value in a 1D floating-point tensor.
Definition: nn_f32.h:798
Tensor0D_F32 * nn_rand0d_f32()
Returns a 0D floating-point data tensor (scalar) filled with a random floating-point number.
Definition: nn_f32.h:432
Tensor1D_F32 * nn_as_tensor1d_f32(size_t shape[1], float *data)
Converts data into a tensor, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:206
void nn_mul2d_f32(Tensor2D_F32 *y, const Tensor2D_F32 *x1, const Tensor2D_F32 *x2)
Multiplies x1 and x2 element-wise and stores the result in y.
Definition: nn_f32.h:1059
Tensor3D_F32 * nn_as_tensor3d_f32(size_t shape[3], float *data)
Converts data into a tensor, with the shape defined by the 3-element array shape.
Definition: nn_f32.h:237
Tensor1D_F32 * nn_full1d_f32(size_t shape[1], float data)
Returns a 1D floating-point data tensor filled with the scalar value data, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:401
Tensor0D_F32 * nn_zeros0d_f32()
Returns a 0D floating-point tensor (scalar) filled with the scalar value 0.
Definition: nn_f32.h:270
uint8_t nn_equals2d_f32(const Tensor2D_F32 *a, const Tensor2D_F32 *b, float rel_err)
Checks if two 2D floating-point tensors are equal.
Definition: nn_f32.h:665
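A golden-versus-actual comparison sketch using nn_equals2d_f32, assuming the header is included; per nn_equal_f32 above, rel_err is accepted either as an absolute or as a relative tolerance for each element.

void equals_example(void) {
  float golden_data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float actual_data[4] = {1.000001f, 2.000001f, 3.0f, 4.0f};
  Tensor2D_F32 *golden = nn_tensor2d_f32((size_t[]){2, 2}, golden_data);
  Tensor2D_F32 *actual = nn_tensor2d_f32((size_t[]){2, 2}, actual_data);
  uint8_t ok = nn_equals2d_f32(golden, actual, 1e-3f); /* 1 if every element matches within tolerance */
  (void)ok;
}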
Tensor0D_F32 * nn_tensor0d_f32(float data)
Creates a 0D floating-point data tensor.
Definition: nn_f32.h:98
Tensor4D_F32 * nn_as_tensor4d_f32(size_t shape[4], float *data)
Converts data into a tensor, with the shape defined by the 4-element array shape.
Definition: nn_f32.h:254
void nn_print_tensor1d_f32(const Tensor1D_F32 *tensor)
Prints the content of a 1D floating-point data tensor.
Definition: nn_f32.h:481
Tensor2D_F32 * nn_ones2d_f32(size_t shape[2])
Returns a 2D floating-point data tensor filled with the scalar value 1, with the shape defined by the 2-element array shape.
Definition: nn_f32.h:372
void nn_max1d_f32(Tensor0D_F32 *y, const Tensor1D_F32 *x)
Finds the maximum value in a 1D floating-point tensor.
Definition: nn_f32.h:734
Tensor1D_F32 * nn_tensor1d_f32(size_t shape[1], const float *data)
Creates a 1D floating-point data tensor, with the shape defined by the 1-element array shape.
Definition: nn_f32.h:114
void nn_nhwc_to_nchw_f32(Tensor4D_F32 *out, const Tensor4D_F32 *in)
Converts a 4D tensor from NHWC format to NCHW format.
Definition: nn_f32.h:1203
Tensor4D_F32 * nn_zeros4d_f32(size_t shape[4])
Returns a 4D floating-point data tensor filled with the scalar value 0, with the shape defined by the 4-element array shape.
Definition: nn_f32.h:330
float data
Definition: nn_f32.h:24
A 0D tensor (scalar) with a float data type.
Definition: nn_f32.h:23
size_t shape[1]
Definition: nn_f32.h:34
float * data
Definition: nn_f32.h:35
A 1D tensor with a float data type.
Definition: nn_f32.h:33
float * data
Definition: nn_f32.h:46
size_t shape[2]
Definition: nn_f32.h:45
A 2D tensor with a float data type.
Definition: nn_f32.h:44
size_t shape[3]
Definition: nn_f32.h:55
float * data
Definition: nn_f32.h:56
A 3D tensor with a float data type.
Definition: nn_f32.h:54
size_t shape[4]
Definition: nn_f32.h:65
float * data
Definition: nn_f32.h:66
A 4D tensor with a float data type.
Definition: nn_f32.h:64