Skip to content

Commit 14a0b20

Browse files
committed
ggml : implement vaddvq when missing
1 parent 2ae3164 commit 14a0b20

File tree

1 file changed

+31
-14
lines changed

1 file changed

+31
-14
lines changed

Diff for: ggml.c

+31-14
Original file line number | Diff line number | Diff line change
@@ -492,26 +492,43 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
492492
#endif
493493

494494
#if __ARM_NEON
495+
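// ARMv8 not available: provide our own vaddvq_* horizontal-add helpers below
// NOTE(review): __ARM_FEATURE_QRDMX looks like an ARMv8.1 feature macro - confirm it is the intended "ARMv8 present" test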
495496
#if !defined(__ARM_FEATURE_QRDMX)
496497

497-
inline static int16_t vaddvq_s16(int16x8_t v) {
498-
const int16x4_t v1 = vadd_s16(vget_low_s16(v), vget_high_s16(v));
499-
return vaddv_s16(v1);
498+
// Horizontal add of all 16 unsigned 8-bit lanes of v (fallback used when the
// toolchain does not supply this helper).
//
// The sum is returned widened to 16 bits, so it cannot wrap: the largest
// possible value is 16 * 255 = 4080.
inline static uint16_t vaddvq_u8(uint8x16_t v) {
    // lanes are summed in four groups of four; each group is at most 1020
    const uint16_t q0 = (uint16_t)(vgetq_lane_u8(v,  0) + vgetq_lane_u8(v,  1) +
                                   vgetq_lane_u8(v,  2) + vgetq_lane_u8(v,  3));
    const uint16_t q1 = (uint16_t)(vgetq_lane_u8(v,  4) + vgetq_lane_u8(v,  5) +
                                   vgetq_lane_u8(v,  6) + vgetq_lane_u8(v,  7));
    const uint16_t q2 = (uint16_t)(vgetq_lane_u8(v,  8) + vgetq_lane_u8(v,  9) +
                                   vgetq_lane_u8(v, 10) + vgetq_lane_u8(v, 11));
    const uint16_t q3 = (uint16_t)(vgetq_lane_u8(v, 12) + vgetq_lane_u8(v, 13) +
                                   vgetq_lane_u8(v, 14) + vgetq_lane_u8(v, 15));

    return (uint16_t)(q0 + q1 + q2 + q3);
}
501509

502-
inline static uint16_t vaddvq_u16(uint16x8_t v) {
503-
const uint16x4_t v1 = vadd_u16(vget_low_u16(v), vget_high_u16(v));
504-
return vaddv_u16(v1);
510+
// Horizontal add of the 8 signed 16-bit lanes of v (fallback used when the
// toolchain does not supply this helper).
//
// Lanes are accumulated in 32 bits, so the result cannot overflow:
// |sum| <= 8 * 32768.
inline static int32_t vaddvq_s16(int16x8_t v) {
    int32_t total = vgetq_lane_s16(v, 0);

    total += vgetq_lane_s16(v, 1);
    total += vgetq_lane_s16(v, 2);
    total += vgetq_lane_s16(v, 3);
    total += vgetq_lane_s16(v, 4);
    total += vgetq_lane_s16(v, 5);
    total += vgetq_lane_s16(v, 6);
    total += vgetq_lane_s16(v, 7);

    return total;
}
517+
518+
// Horizontal add of the 8 unsigned 16-bit lanes of v (fallback used when the
// toolchain does not supply this helper).
//
// Lanes are accumulated in 32 bits, so the result cannot wrap:
// sum <= 8 * 65535.
inline static uint32_t vaddvq_u16(uint16x8_t v) {
    uint32_t total = vgetq_lane_u16(v, 0);

    total += vgetq_lane_u16(v, 1);
    total += vgetq_lane_u16(v, 2);
    total += vgetq_lane_u16(v, 3);
    total += vgetq_lane_u16(v, 4);
    total += vgetq_lane_u16(v, 5);
    total += vgetq_lane_u16(v, 6);
    total += vgetq_lane_u16(v, 7);

    return total;
}
506525

507526
// Horizontal add of the 4 signed 32-bit lanes of v (fallback used when the
// toolchain does not supply this helper).
//
// The lanes are added as unsigned 32-bit values and converted back at the end.
// Adding them directly as int32_t would be undefined behaviour whenever the
// sum leaves the int32_t range; unsigned arithmetic wraps modulo 2^32 instead,
// which is also what a 32-bit hardware reduction produces.
inline static int32_t vaddvq_s32(int32x4_t v) {
    const uint32_t sum =
        (uint32_t)vgetq_lane_s32(v, 0) + (uint32_t)vgetq_lane_s32(v, 1) +
        (uint32_t)vgetq_lane_s32(v, 2) + (uint32_t)vgetq_lane_s32(v, 3);

    // out-of-range conversion is implementation-defined (two's-complement wrap
    // on GCC/Clang), never undefined
    return (int32_t)sum;
}
511529

512530
// Horizontal add of the 4 float lanes of v (fallback used when the toolchain
// does not supply this helper).
//
// Lanes are added strictly left to right, ((l0 + l1) + l2) + l3; the order
// matters because float addition is not associative.
inline static float vaddvq_f32(float32x4_t v) {
    float acc = vgetq_lane_f32(v, 0);

    acc += vgetq_lane_f32(v, 1);
    acc += vgetq_lane_f32(v, 2);
    acc += vgetq_lane_f32(v, 3);

    return acc;
}
516533

517534
#endif
@@ -2313,10 +2330,10 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
23132330
const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
23142331
const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
23152332

2316-
const uint16x8_t pl1l = vmull_u8(vget_low_s8 (v0_1l), vget_low_u8 (v1_1l));
2317-
const uint16x8_t pl1h = vmull_u8(vget_high_s8(v0_1l), vget_high_u8(v1_1l));
2318-
const uint16x8_t ph1l = vmull_u8(vget_low_s8 (v0_1h), vget_low_u8 (v1_1h));
2319-
const uint16x8_t ph1h = vmull_u8(vget_high_s8(v0_1h), vget_high_u8(v1_1h));
2333+
const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
2334+
const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
2335+
const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
2336+
const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
23202337

23212338
const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
23222339
const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);

0 commit comments

Comments
 (0)