-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathlatencykernel.cl
135 lines (126 loc) · 4.13 KB
/
latencykernel.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count
// not divisible by 10
__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
int current = A[0];
int result;
for (int i = 0; i < count; i += 10) {
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
}
ret[0] = result;
}
__kernel void unrolled_latency_test_amdvectorworkaround(__global const int* A, int count, __global int* ret) {
int start = A[1 + get_local_id(0)];
int current = A[start];
int result;
for (int i = 0; i < count; i += 10) {
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
}
ret[0] = result;
}
// latency test like the unrolled one above, but with input as constant memory
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
int current = A[0];
int result;
for (int i = 0; i < count; i += 10) {
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
result += current;
current = A[current];
}
ret[0] = result;
}
#define local_mem_test_size 1024
// uses local memory (LDS/shmem)
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
__local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
// better be fast
for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// everyone else can chill/get masked off
if (get_local_id(0) == 0) {
int current = local_a[0];
int result;
for (int i = 0; i < count; i += 10) {
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
result += current;
current = local_a[current];
}
ret[0] = result;
}
}
__kernel void dummy_add(__global int* A) {
A[get_global_id(0)]++;
}