-
Notifications
You must be signed in to change notification settings - Fork 0
/
maxmul.cu
69 lines (57 loc) · 1.85 KB
/
maxmul.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#include <stdio.h>
#include <cuda.h>
/*
 * Kernel: computes one element of the square matrix product C = A * B.
 *
 * All three matrices are flat arrays of size * size floats in which element
 * (r, c) lives at index r * size + c.
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension selects the column
 * and the y dimension selects the row; each thread produces at most one
 * element of C. The grid may overshoot the matrix: threads whose (row, col)
 * falls outside the size x size range exit without touching memory.
 *
 * A, B : input matrices (read only by this kernel).
 * C    : output matrix; element (row, col) is overwritten.
 * size : number of rows/columns of each matrix.
 */
__global__ void vecmul(float *A, float *B, float *C, int size)
{
    // Row and column of the output element owned by this thread.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix have nothing to do.
    if (col >= size || row >= size)
    {
        return;
    }

    // Offsets are computed in size_t: row * size exceeds the range of a
    // 32-bit int as soon as size is larger than 46340.
    const size_t n = (size_t)size;
    const size_t rowStart = (size_t)row * n;

    // Dot product of row `row` of A with column `col` of B.
    float result = 0.0f;
    for (size_t ix = 0; ix < n; ix++)
    {
        result += A[rowStart + ix] * B[ix * n + (size_t)col];
    }
    C[rowStart + (size_t)col] = result;
}
extern "C"
{
/*
 * Returns the maximum number of threads a single block may contain on CUDA
 * device 0, as reported by cudaGetDeviceProperties (maxThreadsPerBlock).
 *
 * Returns 0 when the device properties cannot be queried, so callers can
 * distinguish "no usable device" from a real limit instead of receiving the
 * contents of an uninitialised struct.
 */
int getThreadNum()
{
    cudaDeviceProp prop;
    cudaError_t status = cudaGetDeviceProperties(&prop, 0);
    if (status != cudaSuccess)
    {
        // prop is not filled in when the query fails; do not read it.
        return 0;
    }
    return prop.maxThreadsPerBlock;
}
/*
 * Multiplies two square matrices on the GPU: C = A * B.
 *
 * A, B : host pointers to the input matrices, each size * size floats with
 *        element (r, c) at index r * size + c.
 * C    : host pointer to size * size floats that receives the product.
 * size : number of rows/columns; a value <= 0 makes the call a no-op.
 *
 * The function copies A and B to the device, launches vecmul with one thread
 * per output element, copies the result back and releases the device
 * buffers. It has no return value, so any CUDA failure is reported on stderr
 * and C is then left unmodified or only partially written.
 */
void maxmul(float *A, float *B, float *C, int size)
{
    if (size <= 0)
    {
        return; // Nothing to compute.
    }

    int maxThreads = getThreadNum();
    if (maxThreads <= 0)
    {
        fprintf(stderr, "maxmul: no usable CUDA device\n");
        return;
    }

    // The per-block thread limit applies to the TOTAL thread count
    // (x * y * z), so a square block may be at most sqrt(maxThreads) wide.
    // Start from a 16x16 tile (256 threads) and shrink if the device
    // reports a smaller limit.
    int tile = 16;
    while (tile > 1 && tile * tile > maxThreads)
    {
        tile /= 2;
    }

    // Ceil-div without overflow (size > 0 here): enough tiles per axis to
    // cover every row and column.
    int tilesPerSide = (size - 1) / tile + 1;

    // Byte count in size_t: size * size * 4 does not fit in an int for
    // large matrices.
    size_t bytes = (size_t)size * (size_t)size * sizeof(float);

    float *gpu_A = NULL;
    float *gpu_B = NULL;
    float *gpu_C = NULL;

    // Each step runs only if everything before it succeeded.
    cudaError_t status = cudaMalloc((void **)&gpu_A, bytes);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void **)&gpu_B, bytes);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void **)&gpu_C, bytes);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(gpu_A, A, bytes, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(gpu_B, B, bytes, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess)
    {
        // Blocks & grid:
        dim3 blocks(tile, tile);
        dim3 grid(tilesPerSide, tilesPerSide);
        // Call the kernel:
        vecmul<<<grid, blocks>>>(gpu_A, gpu_B, gpu_C, size);
        // A launch does not return a status; an invalid configuration is
        // only visible through cudaGetLastError().
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        // Get the result matrix. This blocking copy also waits for the
        // kernel and reports any error raised while it was running.
        status = cudaMemcpy(C, gpu_C, bytes, cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess)
    {
        fprintf(stderr, "maxmul: CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Free device matrices (cudaFree accepts NULL for buffers that were
    // never allocated).
    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
}
}