-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathspectralnorm.c
255 lines (209 loc) · 5.89 KB
/
spectralnorm.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/* The Computer Language Benchmarks Game
* http://benchmarksgame.alioth.debian.org/
*
* Original C contributed by Sebastien Loisel
* Conversion to C++ by Jon Harrop
* OpenMP parallelize by The Anh Tran
* Add SSE by The Anh Tran
* Reconversion into C by Dan Farina
*/
#define _GNU_SOURCE
#include <omp.h>
#include <math.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#define false 0
#define true 1
/* define SIMD data type. 2 doubles encapsulated in one XMM register */
typedef double v2dt __attribute__((vector_size(16)));
static const v2dt v1 = {1.0, 1.0};
/* parameter for evaluate functions */
struct Param
{
double* u; /* source vector */
double* tmp; /* temporary */
double* v; /* destination vector */
int N; /* source/destination vector length */
int N2; /* = N/2 */
int r_begin; /* working range of each thread */
int r_end;
};
/* Return: 1.0 / (i + j) * (i + j +1) / 2 + i + 1; */
static double
eval_A(int i, int j)
{
/*
* 1.0 / (i + j) * (i + j +1) / 2 + i + 1;
* n * (n+1) is even number. Therefore, just (>> 1) for (/2)
*/
int d = (((i+j) * (i+j+1)) >> 1) + i+1;
return 1.0 / d;
}
/*
* Return type: 2 doubles in xmm register [double1, double2]
* double1 = 1.0 / (i + j) * (i + j +1) / 2 + i + 1;
* double2 = 1.0 / (i+1 + j) * (i+1 + j +1) / 2 + i+1 + 1;
*/
static v2dt
eval_A_i(int i, int j)
{
int d1 = (((i+j) * (i+j+1)) >> 1) + i+1;
int d2 = (((i+1 +j) * (i+1 +j+1)) >> 1) + (i+1) +1;
v2dt r = {d1, d2};
return v1 / r;
}
/*
* Return type: 2 doubles in xmm register [double1, double2]
* double1 = 1.0 / (i + j) * (i + j +1) / 2 + i + 1;
* double2 = 1.0 / (i + j+1) * (i + j+1 +1) / 2 + i + 1;
*/
static v2dt
eval_A_j(int i, int j)
{
int d1 = (((i+j) * (i+j+1)) >> 1) + i+1;
int d2 = (((i+ j+1) * (i+ j+1 +1)) >> 1) + i+1;
v2dt r = {d1, d2};
return v1 / r;
}
/* This function is called by many threads */
static void
eval_A_times_u(struct Param *p)
{
/* alias of source vector */
const v2dt *pU = (void *) p->u;
int i;
int ie;
for (i = p->r_begin, ie = p->r_end; i < ie; i++)
{
v2dt sum = {0, 0};
/* xmm = 2 doubles. This loop run from [0 .. N/2) */
int j;
for (j = 0; j < p->N2; j++)
sum += pU[j] * eval_A_j(i, j*2);
/* write result */
{
double *mem = (void *) ∑
p->tmp[i] = mem[0] + mem[1];
}
/* If source vector is odd size. This should be called <= 1 time */
for (j = j*2; __builtin_expect(j < p->N, false); j++)
p->tmp[i] += eval_A(i, j) * p->u[j];
}
}
static void
eval_At_times_u(struct Param *p)
{
const v2dt *pT = (void *) p->tmp;
int i;
int ie;
for (i = p->r_begin, ie = p->r_end; i < ie; i++)
{
v2dt sum = {0, 0};
int j;
for (j = 0; j < p->N2; j++)
sum += pT[j] * eval_A_i(j*2, i);
{
double *mem = (void *) ∑
p->v[i] = mem[0] + mem[1];
}
/* odd size array */
for (j = j*2; __builtin_expect(j < p->N, false); j++)
p->v[i] += eval_A(j, i) * p->tmp[j];
}
}
/*
* Called by N threads.
*
* Each thread modifies its portion in destination vector -> barrier needed to
* sync access
*/
static void
eval_AtA_times_u(struct Param *p)
{
eval_A_times_u(p);
#pragma omp barrier
eval_At_times_u(p);
#pragma omp barrier
}
/*
* Shootout bench uses affinity to emulate single core processor. This
* function searches for appropriate number of threads to spawn.
*/
static int
GetThreadCount()
{
cpu_set_t cs;
int i;
int count = 0;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
for (i = 0; i < 16; i++)
if (CPU_ISSET(i, &cs))
count++;
return count;
}
static double
spectral_game(int N)
{
/* Align 64 byte for L2 cache line */
__attribute__((aligned(64))) double u[N];
__attribute__((aligned(64))) double tmp[N];
__attribute__((aligned(64))) double v[N];
double vBv = 0.0;
double vv = 0.0;
#pragma omp parallel default(shared) num_threads(GetThreadCount())
{
int i;
#pragma omp for schedule(static)
for (i = 0; i < N; i++)
u[i] = 1.0;
/*
* this block will be executed by NUM_THREADS variable declared in this
* block is private for each thread
*/
int threadid = omp_get_thread_num();
int threadcount = omp_get_num_threads();
int chunk = N / threadcount;
int ite;
struct Param my_param;
my_param.tmp = tmp;
my_param.N = N;
my_param.N2 = N/2;
/*
* calculate each thread's working range [range1 .. range2) => static
* schedule here
*/
my_param.r_begin = threadid * chunk;
my_param.r_end = (threadid < (threadcount -1)) ?
(my_param.r_begin + chunk) : N;
for (ite = 0; ite < 10; ite++)
{
my_param.u = u; /* source vec is u */
my_param.v = v; /* destination vec is v */
eval_AtA_times_u(&my_param);
my_param.u = v; /* source is v */
my_param.v = u; /* destination is u */
eval_AtA_times_u(&my_param);
}
/* multi thread adding */
{
int i;
#pragma omp for schedule(static) reduction( + : vBv, vv ) nowait
for (i = 0; i < N; i++)
{
vv += v[i] * v[i];
vBv += u[i] * v[i];
}
}
}
/* end parallel region */
return sqrt(vBv/vv);
}
int
main(int argc, char *argv[])
{
int N = ((argc >= 2) ? atoi(argv[1]) : 2000);
printf("%.9f\n", spectral_game(N));
return 0;
}