@@ -492,26 +492,43 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
492
492
#endif
493
493
494
494
#if __ARM_NEON
495
+ // ARMv8 is not available (no __ARM_FEATURE_QRDMX): define fallback vaddvq_* helpers
495
496
#if !defined(__ARM_FEATURE_QRDMX )
496
497
497
- inline static int16_t vaddvq_s16 (int16x8_t v ) {
498
- const int16x4_t v1 = vadd_s16 (vget_low_s16 (v ), vget_high_s16 (v ));
499
- return vaddv_s16 (v1 );
498
+ inline static uint16_t vaddvq_u8 (uint8x16_t v ) {
499
+ return
500
+ (uint16_t )vgetq_lane_u8 (v , 0 ) + (uint16_t )vgetq_lane_u8 (v , 1 ) +
501
+ (uint16_t )vgetq_lane_u8 (v , 2 ) + (uint16_t )vgetq_lane_u8 (v , 3 ) +
502
+ (uint16_t )vgetq_lane_u8 (v , 4 ) + (uint16_t )vgetq_lane_u8 (v , 5 ) +
503
+ (uint16_t )vgetq_lane_u8 (v , 6 ) + (uint16_t )vgetq_lane_u8 (v , 7 ) +
504
+ (uint16_t )vgetq_lane_u8 (v , 8 ) + (uint16_t )vgetq_lane_u8 (v , 9 ) +
505
+ (uint16_t )vgetq_lane_u8 (v , 10 ) + (uint16_t )vgetq_lane_u8 (v , 11 ) +
506
+ (uint16_t )vgetq_lane_u8 (v , 12 ) + (uint16_t )vgetq_lane_u8 (v , 13 ) +
507
+ (uint16_t )vgetq_lane_u8 (v , 14 ) + (uint16_t )vgetq_lane_u8 (v , 15 );
500
508
}
501
509
502
- inline static uint16_t vaddvq_u16 (uint16x8_t v ) {
503
- const uint16x4_t v1 = vadd_u16 (vget_low_u16 (v ), vget_high_u16 (v ));
504
- return vaddv_u16 (v1 );
510
+ inline static int32_t vaddvq_s16 (int16x8_t v ) {
511
+ return
512
+ (int32_t )vgetq_lane_s16 (v , 0 ) + (int32_t )vgetq_lane_s16 (v , 1 ) +
513
+ (int32_t )vgetq_lane_s16 (v , 2 ) + (int32_t )vgetq_lane_s16 (v , 3 ) +
514
+ (int32_t )vgetq_lane_s16 (v , 4 ) + (int32_t )vgetq_lane_s16 (v , 5 ) +
515
+ (int32_t )vgetq_lane_s16 (v , 6 ) + (int32_t )vgetq_lane_s16 (v , 7 );
516
+ }
517
+
518
+ inline static uint32_t vaddvq_u16 (uint16x8_t v ) {
519
+ return
520
+ (uint32_t )vgetq_lane_u16 (v , 0 ) + (uint32_t )vgetq_lane_u16 (v , 1 ) +
521
+ (uint32_t )vgetq_lane_u16 (v , 2 ) + (uint32_t )vgetq_lane_u16 (v , 3 ) +
522
+ (uint32_t )vgetq_lane_u16 (v , 4 ) + (uint32_t )vgetq_lane_u16 (v , 5 ) +
523
+ (uint32_t )vgetq_lane_u16 (v , 6 ) + (uint32_t )vgetq_lane_u16 (v , 7 );
505
524
}
506
525
507
526
inline static int32_t vaddvq_s32 (int32x4_t v ) {
508
- const int32x2_t v1 = vadd_s32 (vget_low_s32 (v ), vget_high_s32 (v ));
509
- return vaddv_s32 (v1 );
527
+ return vgetq_lane_s32 (v , 0 ) + vgetq_lane_s32 (v , 1 ) + vgetq_lane_s32 (v , 2 ) + vgetq_lane_s32 (v , 3 );
510
528
}
511
529
512
530
inline static float vaddvq_f32 (float32x4_t v ) {
513
- const float32x2_t v1 = vadd_f32 (vget_low_f32 (v ), vget_high_f32 (v ));
514
- return vaddv_f32 (v1 );
531
+ return vgetq_lane_f32 (v , 0 ) + vgetq_lane_f32 (v , 1 ) + vgetq_lane_f32 (v , 2 ) + vgetq_lane_f32 (v , 3 );
515
532
}
516
533
517
534
#endif
@@ -2313,10 +2330,10 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
2313
2330
const uint16x8_t ph0l = vmull_u8 (vget_low_u8 (v0_0h ), vget_low_u8 (v1_0h ));
2314
2331
const uint16x8_t ph0h = vmull_u8 (vget_high_u8 (v0_0h ), vget_high_u8 (v1_0h ));
2315
2332
2316
- const uint16x8_t pl1l = vmull_u8 (vget_low_s8 (v0_1l ), vget_low_u8 (v1_1l ));
2317
- const uint16x8_t pl1h = vmull_u8 (vget_high_s8 (v0_1l ), vget_high_u8 (v1_1l ));
2318
- const uint16x8_t ph1l = vmull_u8 (vget_low_s8 (v0_1h ), vget_low_u8 (v1_1h ));
2319
- const uint16x8_t ph1h = vmull_u8 (vget_high_s8 (v0_1h ), vget_high_u8 (v1_1h ));
2333
+ const uint16x8_t pl1l = vmull_u8 (vget_low_u8 (v0_1l ), vget_low_u8 (v1_1l ));
2334
+ const uint16x8_t pl1h = vmull_u8 (vget_high_u8 (v0_1l ), vget_high_u8 (v1_1l ));
2335
+ const uint16x8_t ph1l = vmull_u8 (vget_low_u8 (v0_1h ), vget_low_u8 (v1_1h ));
2336
+ const uint16x8_t ph1h = vmull_u8 (vget_high_u8 (v0_1h ), vget_high_u8 (v1_1h ));
2320
2337
2321
2338
const uint16x8_t pl_0 = vaddq_u16 (pl0l , pl0h );
2322
2339
const uint16x8_t ph_0 = vaddq_u16 (ph0l , ph0h );
0 commit comments