@@ -1698,8 +1698,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
     // Horizontal sum of all lanes of the accumulator
     sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
 #elif defined(__AVX2__)
-    const size_t countBlocks = nb;
-
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -5806,23 +5804,28 @@ static void ggml_compute_forward_mul_mat_f32(
     const int ne02 = src0->ne[2];
     const int ne03 = src0->ne[3];
 
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     const int ne10 = src1->ne[0];
+#endif
     const int ne11 = src1->ne[1];
-    //const int ne12 = src1->ne[2];
-    //const int ne13 = src1->ne[3];
+#ifndef NDEBUG
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
 
-    //const int ne0 = dst->ne[0];
-    //const int ne1 = dst->ne[1];
-    //const int ne2 = dst->ne[2];
-    //const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
 
-    //const int nb00 = src0->nb[0];
+    const int nb00 = src0->nb[0];
+#endif
     const int nb01 = src0->nb[1];
     const int nb02 = src0->nb[2];
     const int nb03 = src0->nb[3];
 
+#ifndef NDEBUG
     const int nb10 = src1->nb[0];
+#endif
     const int nb11 = src1->nb[1];
     const int nb12 = src1->nb[2];
     const int nb13 = src1->nb[3];
@@ -5840,8 +5843,9 @@ static void ggml_compute_forward_mul_mat_f32(
     assert(ne2 == ne12);
     assert(ne3 == ne13);
 
-    // TODO: we don't support permuted src0
+    // we don't support permuted src0 or src1
     assert(nb00 == sizeof(float));
+    assert(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
     assert(nb0 == sizeof(float));
@@ -5859,8 +5863,6 @@ static void ggml_compute_forward_mul_mat_f32(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        GGML_ASSERT(nb10 == sizeof(float));
-
         if (params->ith != 0) {
             return;
         }
@@ -5903,9 +5905,6 @@ static void ggml_compute_forward_mul_mat_f32(
         return;
     }
 
-    // TODO: do not support transposed src1
-    assert(nb10 == sizeof(float));
-
     // parallelize by src0 rows using ggml_vec_dot_f32
 
     // total rows in src0
@@ -6169,7 +6168,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6194,8 +6192,9 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
     GGML_ASSERT(ne2 == ne12);
     GGML_ASSERT(ne3 == ne13);
 
-    // TODO: we don't support permuted src0
+    // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);
+    GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -6213,8 +6212,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        GGML_ASSERT(nb10 == sizeof(float));
-
         if (params->ith != 0) {
             return;
         }
@@ -6278,8 +6275,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
         return;
     }
 
-    // TODO: do not support transposed src1
-
     // parallelize by src0 rows using ggml_vec_dot_q4_0
 
     // total rows in src0
@@ -6354,7 +6349,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6379,8 +6373,9 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
     GGML_ASSERT(ne2 == ne12);
     GGML_ASSERT(ne3 == ne13);
 
-    // TODO: we don't support permuted src0
+    // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);
+    GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -6398,8 +6393,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        GGML_ASSERT(nb10 == sizeof(float));
-
         if (params->ith != 0) {
             return;
         }
@@ -6466,8 +6459,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
        return;
     }
 
-    // TODO: do not support transposed src1
-
     // parallelize by src0 rows using ggml_vec_dot_q4_1
 
     // total rows in src0
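
For context, a minimal standalone sketch of the pattern several hunks above apply: values that are read only by assert() are declared under #ifndef NDEBUG, so release builds (where assert() expands to ((void)0)) do not warn about unused variables, while debug builds still verify that the innermost strides equal sizeof(float), i.e. src0 and src1 are not permuted. The struct and function names below are illustrative, not code from ggml.c.

/* sketch: debug-only layout checks; names are made up for illustration */
#include <assert.h>
#include <stddef.h>

struct toy_tensor {     /* hypothetical stand-in for the real tensor type */
    int    ne[4];       /* number of elements per dimension               */
    size_t nb[4];       /* stride in bytes per dimension                  */
};

static void check_layout(const struct toy_tensor * src0, const struct toy_tensor * src1) {
#ifndef NDEBUG
    /* needed only by the asserts below */
    const size_t nb00 = src0->nb[0];
    const size_t nb10 = src1->nb[0];
#endif

    /* we don't support permuted src0 or src1 */
    assert(nb00 == sizeof(float));
    assert(nb10 == sizeof(float));

    (void) src0;        /* keep parameters "used" when NDEBUG removes the asserts */
    (void) src1;
}

int main(void) {
    struct toy_tensor a = { {4, 2, 1, 1}, {sizeof(float), 4*sizeof(float), 8*sizeof(float), 8*sizeof(float)} };
    struct toy_tensor b = { {4, 3, 1, 1}, {sizeof(float), 4*sizeof(float), 12*sizeof(float), 12*sizeof(float)} };
    check_layout(&a, &b);   /* passes: both tensors have contiguous rows of floats */
    return 0;
}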