@@ -13952,7 +13952,7 @@ static void ggml_compute_forward_mul_mat_sparse_head(
int64_t ir010 = dr0*ith0;
// const int64_t ir011 = MIN(ir010 + dr0, nr0);
- const int64_t ir011 = ir010 + dr0;
+ // const int64_t ir011 = ir010 + dr0;

const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
@@ -13969,13 +13969,13 @@ static void ggml_compute_forward_mul_mat_sparse_head(
assert(ne13 % ne03 == 0);

// block-tiling attempt
- const int64_t blck_0 = 16;
+ // const int64_t blck_0 = 16;
const int64_t blck_1 = 16;

// attempt to reduce false-sharing (does not seem to make a difference)
- float tmp[16];
+ // float tmp[16];
float *ffdata = (float *)dst->src[2]->data;
- int *gid = (int *)dst->src[3]->data;
+ // int *gid = (int *)dst->src[3]->data;

while(true) {
ir010 = atomic_fetch_add(params->aic, dr0);
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
@@ -14210,12 +14210,12 @@ static void ggml_compute_forward_mul_mat_sparse(
assert(ne13 % ne03 == 0);

// block-tiling attempt
- const int64_t blck_0 = 16;
+ // const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
- int total = 0;
+ // int total = 0;

// attempt to reduce false-sharing (does not seem to make a difference)
- float tmp[16];
+ // float tmp[16];
float *ffdata = (float *)dst->src[2]->data;
int *gid = (int *)dst->src[3]->data;
float *predictor_data = (float *)dst->src[2]->data;
@@ -14291,13 +14291,14 @@ static void ggml_compute_forward_mul_mat_sparse(
}

// vz = alpha * vx + vy
- static void ggml_axpy_normal_f16(const int n, const ggml_fp16_t * vx, const ggml_fp16_t * restrict vy, const void* restrict vz, ggml_fp16_t alpha) {
+ static void ggml_axpy_normal_f16(const int n, const ggml_fp16_t * vx, const ggml_fp16_t * restrict vy, void* restrict vz, ggml_fp16_t alpha) {
float *res = (float *)vz;
for (int i = 0; i < n; i++) {
res[i] = res[i] + (GGML_FP16_TO_FP32(vx[i])*GGML_FP16_TO_FP32(alpha));
}
+ (void) vy;
}
- static void ggml_axpy_avx_f16(const int n, const ggml_fp16_t * restrict vx, const ggml_fp16_t * restrict vy, void* restrict vz, ggml_fp16_t alpha) {
+ static void ggml_axpy_avx_f16(const int n, const ggml_fp16_t * restrict vx, const ggml_fp16_t * vy, void* vz, ggml_fp16_t alpha) {
#if defined(__AVX2__)
float *result = (float *)vz;
float alpha_f32 = GGML_FP16_TO_FP32(alpha);
@@ -14316,7 +14317,7 @@ static void ggml_axpy_avx_f16(const int n, const ggml_fp16_t * restrict vx, cons
res[i] = res[i] + (GGML_FP16_TO_FP32(vx[i])*alpha_convert);
}
#endif
-
+ (void)vy;
}
atomic_flag g_axpy_dense_lock = ATOMIC_FLAG_INIT;
static void ggml_compute_forward_mul_mat_axpy_dense(
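Aside, not part of the commit: the two hunks above adjust the axpy helpers, which accumulate vz += alpha * vx and ignore vy entirely (despite the "vz = alpha * vx + vy" comment), which is why the diff adds the (void)vy casts. Below is a minimal scalar sketch of that accumulation, using plain float and hypothetical names instead of ggml_fp16_t and the AVX2 path.

#include <stdio.h>

/* acc[i] += alpha * row[i] -- the scalar core that ggml_axpy_avx_f16 vectorizes */
static void axpy_f32(int n, const float *row, float *acc, float alpha) {
    for (int i = 0; i < n; i++) {
        acc[i] += alpha * row[i];
    }
}

int main(void) {
    float row[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    axpy_f32(4, row, acc, 0.5f);            /* acc becomes {0.5, 1.0, 1.5, 2.0} */
    printf("%.1f %.1f %.1f %.1f\n", acc[0], acc[1], acc[2], acc[3]);
    return 0;
}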
@@ -14329,14 +14330,14 @@ static void ggml_compute_forward_mul_mat_axpy_dense(
GGML_TENSOR_BINARY_OP_LOCALS;

- const int ith = params->ith;
+ // const int ith = params->ith;
const int nth = params->nth;

const enum ggml_type type = src0->type;

- const bool src1_cont = ggml_is_contiguous(src1);
+ // const bool src1_cont = ggml_is_contiguous(src1);

- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -14356,8 +14357,8 @@ static void ggml_compute_forward_mul_mat_axpy_dense(
GGML_ASSERT(nb2 <= nb3);

// broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // const int64_t r2 = ne12/ne02;
+ // const int64_t r3 = ne13/ne03;

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -14387,7 +14388,7 @@ static void ggml_compute_forward_mul_mat_axpy_dense(
}

ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+ // const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

struct ggml_tensor *src2 = dst->src[2];
@@ -14399,15 +14400,15 @@ static void ggml_compute_forward_mul_mat_axpy_dense(
// const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]);

// src1 rows
- const int64_t nr1 = ne11*ne12*ne13;
+ // const int64_t nr1 = ne11*ne12*ne13;
// float *idx = src2->data;
// int *gid = (int *)(dst->src[3]->data);
// printf("down %d up %d ne00 %d\n", ir10, ir11, ne00);

float vec[ne00*4];
void *vy = vec;
memset(vy, 0, ne00*4);
- char* src0_row = (const char *) src0->data;
+ char* src0_row = (char *) src0->data;
while(true) {
const int ir0 = atomic_fetch_add(params->aic, dr);
for (int64_t ir1 = ir0; ir1 < ir0+dr; ir1++) {
@@ -14417,7 +14418,7 @@ static void ggml_compute_forward_mul_mat_axpy_dense(
// if (idx[ir1] < 0.0f)
// continue;
// ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]);
- ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]);
+ ggml_axpy_avx_f16(ne00, (ggml_fp16_t *)( src0_row+nb01*ir1), (ggml_fp16_t *) vy, vy, wdata[ir1]);
}
if (ir0 + dr >= nr)
break;
@@ -14475,9 +14476,9 @@ static void ggml_compute_forward_mul_mat_axpy(
const enum ggml_type type = src0->type;

- const bool src1_cont = ggml_is_contiguous(src1);
+ // const bool src1_cont = ggml_is_contiguous(src1);

- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -14497,8 +14498,8 @@ static void ggml_compute_forward_mul_mat_axpy(
GGML_ASSERT(nb2 <= nb3);

// broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // const int64_t r2 = ne12/ne02;
+ // const int64_t r3 = ne13/ne03;

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -14550,7 +14551,7 @@ static void ggml_compute_forward_mul_mat_axpy(
float vec[ne00*4];
void *vy = vec;
- char* src0_row = (const char *) src0->data;
+ char* src0_row = (char *) src0->data;
ggml_fp16_t * src1_ptr = NULL;
for (int col_idx = 0; col_idx < nr1; col_idx++) {
src1_ptr = (ggml_fp16_t *)((char *)wdata + col_idx * row_size);
@@ -14571,7 +14572,7 @@ static void ggml_compute_forward_mul_mat_axpy(
if (idx[ir1] < -0.0f)
continue;
// ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]);
- ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, src1_ptr[ir1]);
+ ggml_axpy_avx_f16(ne00, (ggml_fp16_t *)( src0_row+nb01*ir1), (ggml_fp16_t *) vy, vy, src1_ptr[ir1]);
}

// 获取锁 (acquire the lock)
@@ -14625,9 +14626,9 @@ static void ggml_compute_forward_mul_mat_axpy_q4_0(
const enum ggml_type type = src0->type;

- const bool src1_cont = ggml_is_contiguous(src1);
+ // const bool src1_cont = ggml_is_contiguous(src1);

- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -14647,8 +14648,8 @@ static void ggml_compute_forward_mul_mat_axpy_q4_0(
GGML_ASSERT(nb2 <= nb3);

// broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // const int64_t r2 = ne12/ne02;
+ // const int64_t r3 = ne13/ne03;

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -14698,10 +14699,10 @@ static void ggml_compute_forward_mul_mat_axpy_q4_0(
float vec[ne00*4];
void *vy = vec;
- char* src0_row = (const char *) src0->data;
+ char* src0_row = (char *) src0->data;
for (int col_idx = 0; col_idx < nr1; col_idx++) {
// const block_q8_0 * restrict nerual = wdata;
- const block_q8_0 *restrict nerual = ((char *)wdata + col_idx * row_size);
+ const block_q8_0 *restrict nerual = (block_q8_0 *)( (char *)wdata + col_idx * row_size);
idx = (float *)((char *)src2->data + col_idx * idx_row_size);
memset(vy, 0, ne00 * 4);
// while(true) {
@@ -14774,14 +14775,14 @@ static void ggml_compute_forward_mul_mat_axpy_head(
GGML_TENSOR_BINARY_OP_LOCALS;

- const int ith = params->ith;
- const int nth = params->nth;
+ // const int ith = params->ith;
+ // const int nth = params->nth;

const enum ggml_type type = src0->type;

- const bool src1_cont = ggml_is_contiguous(src1);
+ // const bool src1_cont = ggml_is_contiguous(src1);

- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+ // ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
@@ -14801,8 +14802,8 @@ static void ggml_compute_forward_mul_mat_axpy_head(
GGML_ASSERT(nb2 <= nb3);

// broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
+ // const int64_t r2 = ne12/ne02;
+ // const int64_t r3 = ne13/ne03;

// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
@@ -14832,7 +14833,7 @@ static void ggml_compute_forward_mul_mat_axpy_head(
}

const ggml_fp16_t* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+ // const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

struct ggml_tensor *src2 = dst->src[2];
int chunk = ne00 / 32;
@@ -14845,15 +14846,15 @@ static void ggml_compute_forward_mul_mat_axpy_head(
// const int64_t ir11 = MIN(ir10 + dr, src2->ne[0]);

// src1 rows
- const int64_t nr1 = ne11*ne12*ne13;
- float *idx = src2->data;
- int *gid = (int *)(dst->src[3]->data);
+ // const int64_t nr1 = ne11*ne12*ne13;
+ // float *idx = src2->data;
+ // int *gid = (int *)(dst->src[3]->data);
// printf("down %d up %d ne00 %d\n", ir10, ir11, ne00);

float vec[ne00*4];
void *vy = vec;
memset(vy, 0, ne00*4);
- char* src0_row = (const char *) src0->data;
+ char* src0_row = (char *) src0->data;
while (true) {
const int ir0 = atomic_fetch_add(params->aic, dr);
// int id = ir0 >> 7;
@@ -14862,7 +14863,7 @@ static void ggml_compute_forward_mul_mat_axpy_head(
for (int64_t ir1 = ir0; ir1 < ir0+dr; ir1++) {
if (ir1 >= nr) break;
// ggml_axpy_normal_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]);
- ggml_axpy_avx_f16(ne00, src0_row+nb01*ir1, vy, vy, wdata[ir1]);
+ ggml_axpy_avx_f16(ne00, (ggml_fp16_t *)( src0_row+nb01*ir1), (ggml_fp16_t *) vy, vy, wdata[ir1]);
}
if (ir0 + dr >= nr)
break;
@@ -15746,6 +15747,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_MUL_MAT:
+ case GGML_OP_AXPY:
{
// https://cs231n.github.io/optimization-2/#staged
// # forward pass
@@ -16737,20 +16739,7 @@ static void ggml_graph_compute_perf_stats_node_gpu(struct ggml_tensor * node, co
node->perf_cycles += cycles_cur;
node->perf_time_us += time_us_cur;
}
- void busy_wait_cycles(int cycles) {
- struct timespec ts_start, ts_end;
-
- clock_gettime(CLOCK_MONOTONIC, &ts_start);

- while (1) {
- clock_gettime(CLOCK_MONOTONIC, &ts_end);
- long diff_ns = (ts_end.tv_sec - ts_start.tv_sec) * 1000000000 +
- (ts_end.tv_nsec - ts_start.tv_nsec);
- if (diff_ns >= cycles) {
- break;
- }
- }
- }

static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
int n_tasks = 0;
@@ -17164,8 +17153,8 @@ static thread_ret_t ggml_graph_compute_thread_hybrid(void * data) {
/*.type =*/GGML_TASK_COMPUTE,
/*.ith =*/0,
/*.nth =*/1,
- /*.wsize =*/NULL ,
- /*.wdata =*/NULL ,
+ /*.wsize =*/0 ,
+ /*.wdata =*/0 ,
/*.aic =*/0,
};
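Aside, not part of the commit: the last hunk replaces NULL with 0 in the positional initializer for wsize and wdata. Assuming the params struct follows upstream ggml (wsize is a size_t, wdata a void pointer), 0 is a valid initializer for both fields, whereas NULL may expand to (void*)0 and then forces a pointer-to-integer conversion for wsize that compilers usually flag. A self-contained sketch of the same pattern, using a hypothetical stand-in struct:

#include <stddef.h>

/* Hypothetical stand-in: one integer size field, one pointer field. */
struct params_sketch {
    size_t wsize;   /* 0 is fine; NULL may be (void*)0 and would not convert cleanly */
    void  *wdata;   /* 0 and NULL are equivalent for a pointer */
};

int main(void) {
    struct params_sketch p = { /*.wsize =*/0, /*.wdata =*/0 };
    return (int) p.wsize;   /* 0 */
}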