-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKoMaL_S_141.cu
106 lines (78 loc) · 2.84 KB
/
KoMaL_S_141.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <chrono>
#include <cinttypes>
#include <cstdint>
#include <iostream>
#include <stdio.h>
#define THREADS_PER_BLOCK 384
// Software atomic add for doubles, implemented as a compare-and-swap loop
// on the 64-bit integer bit pattern (the standard CUDA programming-guide
// fallback). NOTE(review): hardware atomicAdd(double*) exists natively on
// SM60+ (since CUDA 8) and is faster there — the original comment saying
// "available from CUDA 11.2" was incorrect.
// Returns the value stored at *address before the addition.
__device__ double atomicAdd_double(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
// Reinterpret the double's bits so atomicCAS can be used; retry until no
// other thread has modified *address between our read and the CAS.
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
// Reference single-thread kernel: computes sum_{i=0}^{M-1} N / (P + i)
// sequentially and stores the total in *result.
// Intended to be launched as <<<1, 1>>>; every thread that runs it would
// redo the whole sum and write the same value.
__global__ void normal_kernel(uint64_t M, uint64_t N, uint64_t P, double* result) {
    const double numerator = (double)N;
    double total = 0;
    // Walk the denominator directly instead of recomputing P + i each step.
    uint64_t denom = P;
    for (uint64_t i = 0; i < M; ++i, ++denom) {
        total += numerator / denom;
    }
    *result = total;
}
// Parallel computation of sum_{i=0}^{M-1} N / (P + i) via a grid-stride loop.
// Each thread accumulates a private partial sum; the block combines partials
// in shared memory with an atomic add, and thread 0 of each block folds the
// block total into *result (one global atomic per block).
// Precondition: *result must be zeroed (e.g. cudaMemset) before launch.
__global__ void parallel_kernel(uint64_t M, uint64_t N, uint64_t P, double* result) {
    double Nd = (double)N;
    double local_sum = 0;
    // Cast to 64 bits BEFORE multiplying: blockIdx.x * blockDim.x and
    // blockDim.x * gridDim.x are 32-bit unsigned products and silently
    // wrap for large grids, even when stored into an int64_t.
    int64_t stride = (int64_t)blockDim.x * gridDim.x;
    for (int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < (int64_t)M; i += stride) {
        local_sum += Nd / (P + (uint64_t)i);
    }
    // Combine per-thread partial sums across the block with atomics.
    __shared__ double shared_sum;
    if (threadIdx.x == 0) {
        shared_sum = 0;
    }
    __syncthreads();  // shared_sum initialized before anyone adds to it
    atomicAdd_double(&shared_sum, local_sum);
    __syncthreads();  // all partials folded in before thread 0 publishes
    if (threadIdx.x == 0) {
        atomicAdd_double(result, shared_sum);
    }
}
// Reads N, M, P from stdin, then computes sum_{i=0}^{M-1} N/(P+i) twice:
// once with a single-thread reference kernel and once with the parallel
// grid-stride kernel, printing each result and its wall-clock time.
int main()
{
    uint64_t M, N, P;
    // SCNu64 is the portable scanf conversion for uint64_t; the original
    // "%lu" is undefined behavior on LLP64 platforms (e.g. 64-bit Windows),
    // where unsigned long is 32 bits.
    printf("N: "); if (scanf("%" SCNu64, &N) != 1) return 1;
    printf("M: "); if (scanf("%" SCNu64, &M) != 1) return 1;
    printf("P: "); if (scanf("%" SCNu64, &P) != 1) return 1;
    printf("\t\tSum\t\tTime\n");
    // Normal version: one thread does the entire sum sequentially.
    {
        auto start = std::chrono::high_resolution_clock::now();
        double* dev_result = nullptr;
        cudaError_t err = cudaMalloc((void**)&dev_result, sizeof(double));
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
            return 1;
        }
        normal_kernel<<<1, 1>>>(M, N, P, dev_result);
        err = cudaGetLastError();  // launches don't return errors directly
        if (err != cudaSuccess) {
            fprintf(stderr, "normal_kernel launch: %s\n", cudaGetErrorString(err));
            return 1;
        }
        double result;
        // cudaMemcpy is blocking, so the timer below includes kernel execution.
        err = cudaMemcpy(&result, dev_result, sizeof(double), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
            return 1;
        }
        cudaFree(dev_result);
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        printf("Normal\t\t%f\t%10.6f\n", result, elapsed.count());
    }
    // Parallel version: grid-stride kernel accumulating into *dev_result.
    {
        auto start = std::chrono::high_resolution_clock::now();
        double* dev_result = nullptr;
        cudaError_t err = cudaMalloc((void**)&dev_result, sizeof(double));
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
            return 1;
        }
        // The kernel accumulates atomically, so the result must start at 0.
        cudaMemset(dev_result, 0, sizeof(double));
        // Do the ceil-div in 64 bits: the original int arithmetic overflows
        // for large M. The grid-stride loop makes any grid size correct, so
        // the block count can be clamped to something modest; at least one
        // block is launched even when M == 0 (loop body simply never runs).
        uint64_t blocks64 = (M + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        if (blocks64 == 0) blocks64 = 1;
        if (blocks64 > 65535) blocks64 = 65535;
        int num_blocks = (int)blocks64;
        parallel_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(M, N, P, dev_result);
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "parallel_kernel launch: %s\n", cudaGetErrorString(err));
            return 1;
        }
        double result;
        err = cudaMemcpy(&result, dev_result, sizeof(double), cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
            return 1;
        }
        cudaFree(dev_result);
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        printf("Parallel\t%f\t%10.6f\n", result, elapsed.count());
    }
    return 0;
}