Skip to content

Commit a6bdc47

Browse files
authored
Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
1 parent 7b8dbcb commit a6bdc47

File tree

1 file changed

+24
-1
lines changed

1 file changed

+24
-1
lines changed

ggml.c

+24-1
Original file line number | Diff line number | Diff line change
@@ -1122,13 +1122,36 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
11221122
// NOTE(review): EPR presumably means the number of F16 elements handled per
// 256-bit register (8 floats fit in one __m256) - confirm against its users
#define GGML_F16_EPR 8
11231123

11241124
// F16 arithmetic is not supported by AVX, so we use F32 instead
1125-
// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32
11261125

11271126
// GGML_F32Cx8 is the AVX 256-bit packed-float vector type (__m256);
// _ZERO yields an all-zero vector and _SET1(x) broadcasts x to every lane
#define GGML_F32Cx8 __m256
11281127
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
11291128
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
1129+
1130+
#if defined(__F16C__)
1131+
// the _mm256_cvt intrinsics require F16C
11301132
// LOAD: unaligned 128-bit load from x, then widen the 8 packed
// half-precision values to 8 single-precision floats
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
11311133
// STORE: narrow the 8 floats in y to half precision (rounding-control
// immediate 0) and write the resulting 128 bits to x, unaligned
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
1134+
#else
1135+
// Scalar fallback for GGML_F32Cx8_LOAD, used in the #else branch of the
// __F16C__ check: widens the 8 half-precision values at x to float one at a
// time with GGML_FP16_TO_FP32 and returns them packed in one AVX vector.
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
    float widened[8];
    int lane = 0;

    while (lane < 8) {
        widened[lane] = GGML_FP16_TO_FP32(x[lane]);
        lane++;
    }

    // unaligned load: the local buffer is a plain float array
    return _mm256_loadu_ps(widened);
}
1143+
// Scalar fallback for GGML_F32Cx8_STORE, used in the #else branch of the
// __F16C__ check: spills the 8 floats held in y to a local buffer, then
// narrows each one to half precision and writes it to x[0..7].
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++) {
        // The destination is F16, so the conversion must go float -> half.
        // Using GGML_FP16_TO_FP32 here would treat the float as an F16 bit
        // pattern and store a wrong value.
        x[i] = GGML_FP32_TO_FP16(arr[i]);
    }
}
1151+
// without F16C, route LOAD/STORE through the scalar helper functions
// defined in this #else branch
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
1152+
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
1153+
#endif
1154+
11321155
// arithmetic on GGML_F32Cx8 values: FMA aliases GGML_F32x8_FMA (defined
// outside this excerpt); ADD and MUL map to the packed-float AVX intrinsics
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
11331156
#define GGML_F32Cx8_ADD _mm256_add_ps
11341157
#define GGML_F32Cx8_MUL _mm256_mul_ps

0 commit comments

Comments
 (0)