@@ -104,6 +104,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 }
 
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -134,6 +142,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
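The fp16 dot product above is left as a scalar placeholder under __riscv_v_intrinsic, presumably because a generic RVV build cannot assume half-precision vector support. Below is a minimal sketch of what one row of the dot product might look like when the Zvfh extension is available; the helper name and the per-strip reduction are assumptions for illustration only, not part of this patch, and since ggml stores ggml_fp16_t as a 16-bit integer a real implementation would also need to reinterpret the pointers.

    // Hypothetical sketch, assuming Zvfh: f16 dot product with fp32 accumulation.
    // Each strip is widened to fp32 (vfwmul) and reduced immediately; simpler,
    // though slower, than keeping a vector accumulator across iterations.
    #if defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
    static float dot_f16_rvv_sketch(int n, const _Float16 * x, const _Float16 * y) {
        float sum = 0.0f;
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e16m4(n - i);
            vfloat16m4_t ax = __riscv_vle16_v_f16m4(&x[i], avl);
            vfloat16m4_t ay = __riscv_vle16_v_f16m4(&y[i], avl);
            vfloat32m8_t p  = __riscv_vfwmul_vv_f32m8(ax, ay, avl);    // f16*f16 -> f32
            vfloat32m1_t z  = __riscv_vfmv_v_f_f32m1(0.0f, 1);
            sum += __riscv_vfmv_f_s_f32m1_f32(
                       __riscv_vfredusum_vs_f32m8_f32m1(p, z, avl));   // horizontal add
        }
        return sum;
    }
    #endif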
@@ -228,6 +237,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
             svst1_f32(pg, y + np2, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
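For reference, the strip-mining idiom used in these RVV branches can be exercised on its own. The snippet below is an assumed standalone test, not part of this commit: the function and file layout are illustrative, it would be built with an RVV-enabled toolchain (e.g. -march=rv64gcv), and it uses vfmacc, which computes the same y += v*x as the vfmadd form in the patch. __riscv_vsetvl_e32m8() returns how many f32 lanes the hardware processes in the current strip (avl), so no separate scalar tail loop is needed.

    #include <riscv_vector.h>
    #include <stdio.h>

    static void axpy_f32_rvv(int n, float * y, const float * x, float v) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e32m8(n - i);                    // lanes handled this strip
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);  // load x strip
            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);  // load y strip
            ay = __riscv_vfmacc_vf_f32m8(ay, v, ax, avl);         // ay += v * ax
            __riscv_vse32_v_f32m8(&y[i], ay, avl);                // store back
        }
    }

    int main(void) {
        float x[5] = {1, 2, 3, 4, 5};
        float y[5] = {10, 20, 30, 40, 50};
        axpy_f32_rvv(5, y, x, 2.0f);
        for (int i = 0; i < 5; ++i) printf("%g ", y[i]);          // expected: 12 24 36 48 60
        printf("\n");
        return 0;
    }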
@@ -261,6 +278,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -282,6 +306,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -309,6 +334,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
                 y[i] += x[k][i]*v[k][0];
             }
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+            }
+            __riscv_vse32_v_f32m8(&y[i], ay, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
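Note on the unrolled variant above: each strip of y is loaded once, all GGML_VEC_MAD_UNROLL rows of x are folded in with vfmadd (which computes ax*v[k][0] + ay), and the strip is stored once, so per element the loop performs

    y[i] = y[i] + x[0][i]*v[0][0] + x[1][i]*v[1][0] + ... + x[GGML_VEC_MAD_UNROLL-1][i]*v[GGML_VEC_MAD_UNROLL-1][0]

which matches the scalar fallback in the #else branch.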
@@ -360,6 +395,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
         for (int i = 0; i < n; ++i) {
             y[i] = x[i]*s + b;
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -421,6 +464,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
             ay1 = svmul_f32_m(pg, ay1, vx);
             svst1_f32(pg, y + np, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
         const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -452,6 +502,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -471,6 +528,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
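Like the fp16 dot and mad routines, ggml_vec_scale_f16 keeps a scalar body under __riscv_v_intrinsic for now. If Zvfh could be assumed, a vectorized version would be straightforward, with two caveats: ggml_fp16_t is stored as a 16-bit integer, so the pointer would need reinterpreting, and multiplying directly in fp16 rounds differently from the existing convert-to-fp32 path. A hypothetical sketch, not part of this commit, operating on _Float16 data:

    #if defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
    // Hypothetical helper: scales y in place by v, strip by strip.
    static void scale_f16_rvv_sketch(int n, _Float16 * y, float v) {
        for (int i = 0, avl; i < n; i += avl) {
            avl = __riscv_vsetvl_e16m8(n - i);
            vfloat16m8_t ay = __riscv_vle16_v_f16m8(&y[i], avl);
            ay = __riscv_vfmul_vf_f16m8(ay, (_Float16)v, avl);   // arithmetic done in fp16
            __riscv_vse16_v_f16m8(&y[i], ay, avl);
        }
    }
    #endif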