@@ -172,14 +172,14 @@ static cublasHandle_t cublasH = NULL;
static cudaStream_t cudaStream = NULL;
static void init_cublas(void) {
    if (cublasH == NULL) {
-        /* step 1: create cublas handle, bind a stream */
+        // create cublas handle, bind a stream
        CUBLAS_CHECK(cublasCreate(&cublasH));

        CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
        CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream));

        // configure logging to stdout
-        //CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
    }
}
#endif
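The CUBLAS_CHECK and CUDA_CHECK wrappers used throughout this patch are defined outside the hunks shown here. A minimal sketch of what such error-check macros typically look like (the names match the patch; the bodies below are an assumption, not the patch's actual definitions):

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <stdlib.h>

// Assumed shape: evaluate the call once, abort with a diagnostic on failure.
#define CUDA_CHECK(err)                                        \
    do {                                                       \
        cudaError_t err_ = (err);                              \
        if (err_ != cudaSuccess) {                             \
            fprintf(stderr, "CUDA error %d at %s:%d\n",        \
                    (int) err_, __FILE__, __LINE__);           \
            exit(1);                                           \
        }                                                      \
    } while (0)

#define CUBLAS_CHECK(err)                                      \
    do {                                                       \
        cublasStatus_t err_ = (err);                           \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                   \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n",      \
                    (int) err_, __FILE__, __LINE__);           \
            exit(1);                                           \
        }                                                      \
    } while (0)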
@@ -7336,19 +7336,19 @@ static void ggml_compute_forward_mul_mat_f32(
        float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

#if defined(GGML_USE_CUBLAS)
-        /* step 2: copy data to device */
+        // copy data to device
        CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
        CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));

-        /* step 3: compute */
+        // compute
        CUBLAS_CHECK(
            cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                    ne01, ne11, ne10,
                    &alpha, d_X, ne00,
                            d_Y, ne10,
                    &beta,  d_D, ne01));

-        /* step 4: copy data to host */
+        // copy data to host
        CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
        CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
@@ -7362,7 +7362,6 @@ static void ggml_compute_forward_mul_mat_f32(
        }
    }
#if defined(GGML_USE_CUBLAS)
-    /* free resources */
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_Y));
    CUDA_CHECK(cudaFree(d_D));
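Why CUBLAS_OP_T on d_X: cuBLAS uses column-major storage, so a row-major ggml matrix passed in unchanged is implicitly its transpose. Transposing X back and leaving Y as-is makes the call compute d = Y * X^T in row-major terms, which matches ggml's mul_mat convention (dst has ne01 columns and ne11 rows). A self-contained sketch of the same trick on tiny inputs (illustrative code, not part of the patch; error checks omitted for brevity):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    // X: 2 rows x 3 cols (ne01=2, ne00=ne10=3); Y: 1 row x 3 cols (ne11=1)
    float X[6] = {1, 2, 3,
                  4, 5, 6};
    float Y[3] = {1, 1, 1};
    float D[2] = {0};   // expected {6, 15}: the row sums of X

    float *d_X, *d_Y, *d_D;
    cudaMalloc((void **) &d_X, sizeof(X));
    cudaMalloc((void **) &d_Y, sizeof(Y));
    cudaMalloc((void **) &d_D, sizeof(D));
    cudaMemcpy(d_X, X, sizeof(X), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Y, Y, sizeof(Y), cudaMemcpyHostToDevice);

    cublasHandle_t h;
    cublasCreate(&h);
    const float alpha = 1.0f, beta = 0.0f;
    // m=ne01=2, n=ne11=1, k=ne10=3; lda=ne00=3, ldb=ne10=3, ldc=ne01=2.
    // Column-major d_X is 3x2; OP_T turns it back into the row-major 2x3 X.
    cublasSgemm(h, CUBLAS_OP_T, CUBLAS_OP_N, 2, 1, 3,
                &alpha, d_X, 3, d_Y, 3, &beta, d_D, 2);

    cudaMemcpy(D, d_D, sizeof(D), cudaMemcpyDeviceToHost);
    printf("%g %g\n", D[0], D[1]);   // prints: 6 15

    cublasDestroy(h);
    cudaFree(d_X); cudaFree(d_Y); cudaFree(d_D);
    return 0;
}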
@@ -7533,7 +7532,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
#if defined(GGML_USE_CUBLAS)
-            // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
+            // with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
            {
                size_t id = 0;
                for (int64_t i01 = 0; i01 < ne11; ++i01) {
@@ -7559,11 +7558,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(

        float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

-        /* step 2: copy data to device */
+        // copy data to device
        CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, cudaStream));
        CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, cudaStream));

-        /* step 3: compute */
+        // compute
        CUBLAS_CHECK(
            cublasGemmEx(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                    ne01, ne11, ne10,
@@ -7573,7 +7572,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                    CUBLAS_COMPUTE_32F,
                    CUBLAS_GEMM_DEFAULT));

-        /* step 4: copy data to host */
+        // copy data to host
        CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
        CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
@@ -7593,7 +7592,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    }

#if defined(GGML_USE_CUBLAS)
-    /* free resources */
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_Y));
    CUDA_CHECK(cudaFree(d_D));
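The f16 path avoids dequantizing src0: its rows are already fp16, so only src1 (fp32) has to be converted before upload, and cublasGemmEx then runs with fp16 inputs and fp32 accumulation (CUBLAS_COMPUTE_32F). The body of the conversion loop falls outside the hunks shown above; a sketch of what it plausibly looks like, continuing the truncated loop from the first f16 hunk and assuming ggml's ggml_fp32_to_fp16 helper, ggml's usual nb1x stride fields on src1, and a host staging buffer of ggml_fp16_t called wdata here for illustration:

// Assumed host-side fp32 -> fp16 conversion of src1 before the
// cudaMemcpyAsync upload; wdata is a hypothetical staging buffer.
size_t id = 0;
for (int64_t i01 = 0; i01 < ne11; ++i01) {
    for (int64_t i00 = 0; i00 < ne10; ++i00) {
        wdata[id++] = ggml_fp32_to_fp16(
                *(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
    }
}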
@@ -7797,19 +7795,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
        float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

#if defined(GGML_USE_CUBLAS)
-        /* step 2: copy data to device */
+        // copy data to device
        CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
        CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));

-        /* step 3: compute */
+        // compute
        CUBLAS_CHECK(
            cublasSgemm(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                    ne01, ne11, ne10,
                    &alpha, d_X, ne00,
                            d_Y, ne10,
                    &beta,  d_D, ne01));

-        /* step 4: copy data to host */
+        // copy data to host
        CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
        CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
@@ -7824,7 +7822,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
    }

#if defined(GGML_USE_CUBLAS)
-    /* free resources */
    CUDA_CHECK(cudaFree(d_X));
    CUDA_CHECK(cudaFree(d_Y));
    CUDA_CHECK(cudaFree(d_D));
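All three paths (f32, f16, quantized) follow the same discipline: both host-to-device copies, the GEMM, and the device-to-host copy are enqueued on the one cudaStream created in init_cublas (the GEMM via the cublasSetStream binding), so stream ordering alone guarantees copy, then compute, then copy back, and a single cudaStreamSynchronize at the end is enough before the host reads d. A self-contained toy demonstrating that single-stream ordering (illustrative, not from the patch; error checks omitted):

#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    float x[4] = {1, 2, 3, 4}, y[4] = {0};
    float *d_x;
    cudaMalloc((void **) &d_x, sizeof(x));

    // The H2D and D2H copies are enqueued on the same stream, so they are
    // guaranteed to run in order; one synchronize makes y safe to read.
    cudaMemcpyAsync(d_x, x, sizeof(x), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(y, d_x, sizeof(x), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);   // prints: 1 2 3 4

    cudaFree(d_x);
    cudaStreamDestroy(stream);
    return 0;
}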