@@ -19,23 +19,28 @@ static void cmd_analyze(struct ggml_mulmat_tune *tune);
19
19
static void usage (char * prog ) {
20
20
const char * usage_lines [] = {
21
21
"usage: %s [bench ...] | [analyze FILE] [-h | --help]\n" ,
22
- "\n" ,
23
- "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]\n" ,
24
- "--model MODEL 7B | 13B | 30B | 65B\n" ,
25
- " default 7B\n" ,
26
- "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16\n" ,
27
- " default Q4_0\n" ,
28
- "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)\n" ,
29
- " requires: in range [8, 12]\n" ,
30
- " default 10\n" ,
31
- "--backend BACKEND backend: CUDA | CL | BLAS\n" ,
32
- " default: auto detect\n" ,
33
- "--n_pass number of passes to run\n" ,
34
- " default 3\n" ,
35
- " requires: in range [1, 5]\n" ,
36
- "--file FILE data file to write\n" ,
37
- " default stdout\n" ,
38
- "-y always answer \"yes\" to all prompts\n" ,
22
+ "" ,
23
+ "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]" ,
24
+ "--model MODEL 3B | 7B | 13B | 30B | 65B" ,
25
+ " default 7B" ,
26
+ "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16" ,
27
+ " default Q4_0" ,
28
+ "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)" ,
29
+ " requires within [8, 12]" ,
30
+ " default 10" ,
31
+ "--backend BACKEND backend: CUDA | CL | BLAS" ,
32
+ " default: auto detect" ,
33
+ "--n_pass number of passes to run" ,
34
+ " default 3" ,
35
+ " requires: within [1, 5]" ,
36
+ "--file FILE data file to write" ,
37
+ " default stdout" ,
38
+ "--hint enable hint" ,
39
+ " run less bench for constant or linear stages." ,
40
+ " CAUTION: hint is experimental and the resulting" ,
41
+ " data may be unreliable, enable it only" ,
42
+ " if you know what you are doing" ,
43
+ "-y always answer \"yes\" to all prompts" ,
39
44
};
40
45
41
46
int len = (int )(sizeof (usage_lines ) / sizeof (char * ));
@@ -44,7 +49,7 @@ static void usage(char *prog) {
44
49
if (i == 0 ) {
45
50
fprintf (stderr , line , prog );
46
51
} else {
47
- fprintf (stderr , "%s" , line );
52
+ fprintf (stderr , "%s\n " , line );
48
53
}
49
54
}
50
55
@@ -74,7 +79,7 @@ int main(int argc, char **argv) {
74
79
75
80
if (strcmp (cmd , "bench" ) == 0 ) {
76
81
struct ggml_mulmat_tune tune = {
77
- .version = 1 ,
82
+ .version = GGML_MULMAT_TUNE_VERSION ,
78
83
.n_shapes = 0 ,
79
84
};
80
85
@@ -124,6 +129,9 @@ int main(int argc, char **argv) {
124
129
arg_file = argv [i + 1 ];
125
130
++ i ;
126
131
}
132
+ } else if (strcmp (argv [i ], "--hint" ) == 0 ) {
133
+ fprintf (stderr , "The `hint` feature is not implemented\n" );
134
+ exit (1 );
127
135
} else if (strcmp (argv [i ], "-y" ) == 0 ) {
128
136
always_yes = true;
129
137
} else {
@@ -196,7 +204,7 @@ int main(int argc, char **argv) {
196
204
m_num = v ;
197
205
}
198
206
if (m_num < 8 || m_num > 12 ) {
199
- fprintf (stderr , "invalid m_num: %d, expect in range [8, 12]\n" ,
207
+ fprintf (stderr , "invalid m_num: %d, expect within [8, 12]\n" ,
200
208
m_num );
201
209
usage (argv [0 ]);
202
210
exit (1 );
@@ -209,8 +217,8 @@ int main(int argc, char **argv) {
209
217
int v = atoi (arg_n_pass );
210
218
n_pass = v ;
211
219
}
212
- if (n_pass < 1 || n_pass > MAX_NUM_PASS ) {
213
- fprintf (stderr , "invalid n_pass: %d, expect in range [1, 5]\n" ,
220
+ if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS ) {
221
+ fprintf (stderr , "invalid n_pass: %d, expect within [1, 5]\n" ,
214
222
n_pass );
215
223
usage (argv [0 ]);
216
224
exit (1 );
@@ -350,7 +358,7 @@ int main(int argc, char **argv) {
350
358
351
359
void cmd_bench (struct ggml_mulmat_tune * tune , int n_pass , bool verbose ) {
352
360
size_t wsize = 0 ;
353
- void * q_buf = NULL ;
361
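+ char hist [ 64 ]; // TODO: make sure this size is safe. NOTE(review): the ggml_quantize_* calls below formerly received an (int64_t *) buffer; 64 bytes holds only 8 int64_t values and a char array is not guaranteed to be int64_t-aligned -- confirm how many elements the callee writes, and whether it expects the buffer to be zeroed first.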
354
362
void * wdata = NULL ;
355
363
356
364
// alloc q_buf and wdata with max size.
@@ -364,16 +372,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
364
372
}
365
373
GGML_ASSERT (max_NxK > 0 );
366
374
367
- // NOTE: proximate.
368
- size_t q_buf_size = max_NxK * sizeof (int64_t );
369
-
370
- q_buf = malloc (q_buf_size );
371
- if (!q_buf ) {
372
- fprintf (stderr ,
373
- "failed to allocate memory for q_buf, size: %zu MiB\n" ,
374
- q_buf_size / 1024 / 1024 );
375
- exit (1 );
376
- }
377
375
wsize = max_NxK * sizeof (float );
378
376
wdata = malloc (wsize );
379
377
if (!wdata ) {
@@ -449,23 +447,23 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
449
447
switch (shape -> src0_type ) {
450
448
case GGML_TYPE_Q4_0 :
451
449
ggml_quantize_q4_0 ((const float * )src0_f32 -> data ,
452
- src0 -> data , N * K , K , (int64_t * )q_buf );
450
+ src0 -> data , N * K , K , (void * )hist );
453
451
break ;
454
452
case GGML_TYPE_Q4_1 :
455
453
ggml_quantize_q4_1 ((const float * )src0_f32 -> data ,
456
- src0 -> data , N * K , K , (int64_t * )q_buf );
454
+ src0 -> data , N * K , K , (void * )hist );
457
455
break ;
458
456
case GGML_TYPE_Q5_0 :
459
457
ggml_quantize_q5_0 ((const float * )src0_f32 -> data ,
460
- src0 -> data , N * K , K , (int64_t * )q_buf );
458
+ src0 -> data , N * K , K , (void * )hist );
461
459
break ;
462
460
case GGML_TYPE_Q5_1 :
463
461
ggml_quantize_q5_1 ((const float * )src0_f32 -> data ,
464
- src0 -> data , N * K , K , (int64_t * )q_buf );
462
+ src0 -> data , N * K , K , (void * )hist );
465
463
break ;
466
464
case GGML_TYPE_Q8_0 :
467
465
ggml_quantize_q8_0 ((const float * )src0_f32 -> data ,
468
- src0 -> data , N * K , K , (int64_t * )q_buf );
466
+ src0 -> data , N * K , K , (void * )hist );
469
467
break ;
470
468
default :
471
469
GGML_ASSERT (false);
@@ -492,7 +490,7 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
492
490
// without memset, the first run may be significant slow.
493
491
memset (wdata , 0 , wsize );
494
492
495
- int stage_time [MAX_NUM_PASS ];
493
+ int stage_time [GGML_MULMAT_MAX_PASS ];
496
494
for (int i_bench = 0 ; i_bench < n_pass ; i_bench ++ ) {
497
495
int t0 = (int )ggml_time_us ();
498
496
@@ -529,7 +527,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
529
527
}
530
528
531
529
free (wdata );
532
- free (q_buf );
533
530
}
534
531
535
532
static void print_build_tips (void ) {
0 commit comments