Commit 835397f

ggml-cpu : add basic RVV support for vector f32 ops

1 parent 4cb208c
5 files changed: +114, -18 lines

ggml/src/ggml-cpu/CMakeLists.txt (1 addition, 1 deletion)

@@ -435,7 +435,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         )
     if (GGML_RVV)
         if (GGML_XTHEADVECTOR)
-            list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
+            list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
         elseif (GGML_RV_ZFH)
             list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
         else()
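The -march strings above control which RISC-V extensions the compiler targets, and each enabled extension surfaces as a predefined macro that the rest of this commit keys off. A minimal probe sketch, using only the macros that already appear in this commit (__riscv, __riscv_zfhmin, __riscv_v_intrinsic):

/* probe.c: compile with e.g.
 *   riscv64-linux-gnu-gcc -march=rv64gcv_zfhmin -mabi=lp64d -o probe probe.c
 * to see which of these feature macros a given -march string enables. */
#include <stdio.h>

int main(void) {
#if defined(__riscv)
    puts("__riscv: targeting RISC-V");
#endif
#if defined(__riscv_zfhmin)
    puts("__riscv_zfhmin: scalar fp16<->fp32 conversion available");
#endif
#if defined(__riscv_v_intrinsic)
    puts("__riscv_v_intrinsic: <riscv_vector.h> intrinsics available");
#endif
    return 0;
}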

ggml/src/ggml-cpu/ops.cpp (5 additions, 2 deletions)

@@ -8736,6 +8736,9 @@ static void ggml_compute_forward_ssm_scan_f32(
     }

     sumf = GGML_F32xt_REDUCE_ONE(sum);
+#elif defined(__riscv_v_intrinsic)
+    // todo: RVV implementation
+    const int np = 0;
 #else
     const int np = (nc & ~(GGML_F32_STEP - 1));

@@ -9683,8 +9686,8 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
     int64_t h_stride_2d = head_size * head_size;

 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // Route to scalar implementation // TODO: write SVE code
+    #if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic)
+        // Route to scalar implementation // TODO: write SVE and RVV code
         for (int64_t t = 0; t < T; t++) {
             int64_t t_offset = t * t_stride;
             int64_t state_offset = head_size * C * (t / (T / n_seqs));
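The ssm_scan change works because of how the surrounding function is structured: a SIMD body handles the first np elements and a scalar loop finishes the tail, so defining np = 0 under RVV routes the whole row through the existing scalar code. A generic sketch of that idiom (operands simplified, not the actual ssm_scan math; assumes GGML_F32_STEP from simd-mappings.h is in scope):

static float row_sum_product(const float * x, const float * y, int nc) {
    float sumf = 0.0f;
#if defined(__riscv_v_intrinsic)
    const int np = 0;                            /* SIMD body skipped entirely */
#else
    const int np = (nc & ~(GGML_F32_STEP - 1));  /* largest multiple of the SIMD step */
    /* ... SIMD body would process elements [0, np) here ... */
#endif
    for (int i = np; i < nc; ++i) {              /* scalar tail over [np, nc) */
        sumf += x[i] * y[i];
    }
    return sumf;
}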

ggml/src/ggml-cpu/simd-mappings.h (39 additions, 14 deletions)

@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif

+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
     }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
-        float f;
-        __asm__(
-            "fmv.h.x %[f], %[h]\n\t"
-            "fcvt.s.h %[f], %[f]"
-            : [f] "=&f" (f)
-            : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(ggml_fp16_t));
+        return hf;
     }

     static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         ggml_fp16_t res;
-        __asm__(
-            "fcvt.h.s %[f], %[f]\n\t"
-            "fmv.x.h %[h], %[f]"
-            : [h] "=&r" (res)
-            : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(ggml_fp16_t));
         return res;
     }
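The hand-written inline asm is replaced by the _Float16 type, which the zfhmin march strings added above make available; the compiler then emits the same fcvt.s.h/fcvt.h.s instructions without fragile register constraints. A standalone sketch of the pattern (fp16_bits is a hypothetical stand-in for ggml_fp16_t, which is a 16-bit storage type):

#include <stdint.h>
#include <string.h>

typedef uint16_t fp16_bits;      /* stand-in for ggml_fp16_t */

static inline float fp16_to_fp32(fp16_bits h) {
    _Float16 hf;
    memcpy(&hf, &h, sizeof(h));  /* bit-level reinterpretation, no aliasing UB */
    return (float)hf;            /* fcvt.s.h when zfhmin is available */
}

static inline fp16_bits fp32_to_fp16(float f) {
    _Float16 hf = (_Float16)f;   /* fcvt.h.s when zfhmin is available */
    fp16_bits res;
    memcpy(&res, &hf, sizeof(res));
    return res;
}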

@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
 #define GGML_F16_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE

+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vfloat32m1_t
+#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif

 // GGML_F32_ARR / GGML_F16_ARR
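Unlike the length-agnostic loops elsewhere in this commit, these macros fix the vector length at GGML_F32_EPR = 4 lanes of an m1 register, which is why the block is annotated "compatible with vlen >= 128" (128 bits holds exactly four f32 lanes). Note that GGML_F32_VEC_REDUCE maps to GGML_F32x4_REDUCE, whose definition does not appear in this hunk; presumably it is unused on the RVV path, since the vec.cpp dot product below performs its own reduction. A sketch of what one GGML_F32x4_FMA expands to, assuming an rv64gcv toolchain:

#include <riscv_vector.h>

/* c = a*b + c over exactly four lanes, mirroring
 * GGML_F32x4_FMA(c, a, b) = __riscv_vfmacc_vv_f32m1(c, a, b, 4). */
void fma4_f32(float c[4], const float a[4], const float b[4]) {
    vfloat32m1_t va = __riscv_vle32_v_f32m1(a, 4);
    vfloat32m1_t vb = __riscv_vle32_v_f32m1(b, 4);
    vfloat32m1_t vc = __riscv_vle32_v_f32m1(c, 4);
    vc = __riscv_vfmacc_vv_f32m1(vc, va, vb, 4);  /* vc += va*vb, lanes 0..3 */
    __riscv_vse32_v_f32m1(c, vc, 4);
}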

ggml/src/ggml-cpu/vec.cpp (11 additions, 1 deletion)

@@ -84,6 +84,16 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     }
     // reduce sum1,sum2 to sum1
     GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+#elif defined(__riscv_v_intrinsic)
+    vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
+        vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
+    }
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -197,7 +207,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

     ggml_float sumf = 0.0;

-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
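The added !defined(__riscv_v_intrinsic) is needed because this commit defines GGML_SIMD for RVV but supplies only the F32 macro set: ggml_vec_dot_f16 is written against GGML_F16_STEP and GGML_F16_VEC, which the RVV section of simd-mappings.h does not define. The resulting shape, in outline:

#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
    /* fast path: uses GGML_F16_STEP / GGML_F16_VEC, not defined for RVV */
#else
    /* scalar loop: valid everywhere, used by RVV builds for now */
#endif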

ggml/src/ggml-cpu/vec.h (58 additions, 0 deletions)

@@ -104,6 +104,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }

 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

@@ -134,6 +142,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+#endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {

@@ -228,6 +237,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

         svst1_f32(pg, y + np2, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
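The mad kernels use vfmadd.vf rather than vfmacc.vf, and the operand wiring matters: vfmadd multiplies the destination operand by the scalar, while vfmacc accumulates into it. With the operands as written, both express y += x*v. A sketch making the two forms explicit (assuming rv64gcv):

#include <riscv_vector.h>

void axpy_f32_rvv(float * y, const float * x, float v, int n) {
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        /* vfmadd.vf: vd = vd*rs1 + vs2, so this is ax*v + ay (as in the hunk above) */
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        /* equivalent form, vfmacc.vf: vd = rs1*vs2 + vd
         *   ny = __riscv_vfmacc_vf_f32m8(ay, v, ax, avl); */
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
}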

@@ -261,6 +278,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

@@ -282,6 +306,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {

@@ -309,6 +334,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -360,6 +395,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -421,6 +464,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -452,6 +502,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

@@ -471,6 +528,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+#endif
 #else
     // scalar
    for (int i = 0; i < n; ++i) {
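When bringing up a new vector path like this one, a quick cross-check against the scalar loops catches operand-order and tail-handling mistakes; exact equality cannot be expected for the dot product because of the unordered reduction. A hedged test harness (reusing the hypothetical dot_f32_rvv sketch from the vec.cpp section; link with -lm, run on rv64gcv hardware or under qemu-riscv64):

#include <math.h>
#include <stdio.h>

float dot_f32_rvv(const float * x, const float * y, int n); /* earlier sketch */

int main(void) {
    enum { N = 1027 };                  /* odd length exercises the last pass */
    static float x[N], y[N];
    for (int i = 0; i < N; ++i) {
        x[i] = sinf((float)i);
        y[i] = cosf((float)i);
    }
    double ref = 0.0;                   /* scalar reference in double */
    for (int i = 0; i < N; ++i) ref += (double)x[i] * (double)y[i];
    float got = dot_f32_rvv(x, y, N);
    printf("ref=%.6f got=%.6f diff=%.3g\n", ref, (double)got, fabs(ref - (double)got));
    return fabs(ref - (double)got) < 1e-3 ? 0 : 1;  /* tolerance, not equality */
}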
