-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathbridge.h
291 lines (235 loc) · 9.22 KB
/
bridge.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#ifndef BRIDGE_H
#define BRIDGE_H
// includes from Cuda/C++ matrix element calculations
#include "CPPProcess.h"
#include "Memory.h"
#include "mgOnGpuTypes.h"
#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
// those should become fortran parameters passed in here
const int gpublocks = 1; // 1024;
const int gputhreads = 16; // 256;
// Forward declare transposition kernel
#ifdef __CUDACC__
template <typename T>
__global__ void dev_transpose(const T *in, T *out, const int evt,
const int part, const int mome, const int strd);
#else
template <typename T>
void hst_transpose(const T *in, T *out, const int evt, const int part,
const int mome, const int strd);
#endif // __CUDACC__
// *****************************************************************************
/**
* A templated class for calling the C++ / Cuda matrix element calculations of
* the event generation workflow. The template parameter is used for the
* precision of the calculations (float or double)
*
* The fortran momenta passed in are in the form of
* DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, NB_PAGE)
* where the dimensions are <# momenta>, <# of particles>, <# events>
*/
template <typename T> class Bridge {
public:
/**
* class constructor
*
* @param evt number of events (NB_PAGE, vector.inc)
* @param par number of particles / event (NEXTERNAL, nexternal.inc)
* @param mom number of momenta / particle
* @param str stride length
* @param ncomb number of good helicities (ncomb, mgOnGpuConfig.h)
*/
Bridge(int evt, int par, int mom, int str, int ncomb);
/**
* sequence to be executed for the Cuda matrix element calculation
*
* @param momenta memory address of the input 4-momenta
* @param mes memory address of the output matrix elements
*/
void gpu_sequence(T *momenta, double *mes);
/**
* sequence to be executed for the vectorized CPU matrix element calculation
*
* @param momenta memory address of the input 4-momenta
* @param mes memory address of the output matrix elements
*/
void cpu_sequence(T *momenta, double *mes);
private:
int m_evt; ///< number of events
int m_part; ///< number of particles / event
int m_mome; ///< number of momenta / particle (usually 4)
int m_strd; ///< stride length of the AOSOA structure
int m_ncomb; ///< number of good helicities
bool m_goodHelsCalculated; ///< have the good helicities been calculated?
#ifdef __CUDACC__
typedef std::unique_ptr<bool[], CudaHstDeleter<bool>> CuBHPtr;
typedef std::unique_ptr<bool, CudaDevDeleter<bool>> CuBDPtr;
typedef std::unique_ptr<T[], CudaHstDeleter<T>> CuTHPtr;
typedef std::unique_ptr<T, CudaDevDeleter<T>> CuTDPtr;
CuTDPtr devMomentaF = devMakeUnique<T>(m_evt * m_part * m_mome);
CuTDPtr devMomentaC = devMakeUnique<T>(m_evt * m_part * m_mome);
CuBHPtr hstIsGoodHel = hstMakeUnique<bool>(m_ncomb);
CuBDPtr devIsGoodHel = devMakeUnique<bool>(m_ncomb);
CuTDPtr devMEs = devMakeUnique<T>(m_evt);
#else
// This needs to be inside the function, why?
// typedef std::unique_ptr<T[], CppHstDeleter<T>> CpTHPtr;
// typedef std::unique_ptr<bool[], CppHstDeleter<bool>> CpBHPtr;
// CpTHPtr hstMomenta = hstMakeUnique<T>(m_evt * m_part * m_mome);
// CpBHPtr hstIsGoodHel2 = hstMakeUnique<bool>(m_ncomb);
// CpTHPtr hstMEs = hstMakeUnique<T>(m_evt);
#endif
};
// *****************************************************************************
//
// Implementations of class Bridge member functions
//
template <typename T>
Bridge<T>::Bridge(int evnt, int part, int mome, int strd, int ncomb)
: m_evt(evnt), m_part(part), m_mome(mome), m_strd(strd), m_ncomb(ncomb),
m_goodHelsCalculated(false) {
#ifdef __CUDACC__
gProc::CPPProcess process(1, gpublocks, gputhreads, false);
#else
Proc::CPPProcess process(1, gpublocks, gputhreads, false);
#endif // __CUDACC__
process.initProc("../../Cards/param_card.dat");
}
#ifdef __CUDACC__
template <typename T> void Bridge<T>::gpu_sequence(T *momenta, double *mes) {
checkCuda(cudaMemcpy(devMomentaF.get(), momenta,
m_evt * m_part * m_mome * sizeof(T),
cudaMemcpyHostToDevice));
dev_transpose<<<gpublocks * 16, gputhreads>>>(
devMomentaF.get(), devMomentaC.get(), m_evt, m_part, m_mome, m_strd);
if (!m_goodHelsCalculated) {
gProc::sigmaKin_getGoodHel<<<gpublocks, gputhreads>>>(
devMomentaC.get(), devMEs.get(), devIsGoodHel.get());
checkCuda(cudaMemcpy(hstIsGoodHel.get(), devIsGoodHel.get(),
m_ncomb * sizeof(bool), cudaMemcpyDeviceToHost));
gProc::sigmaKin_setGoodHel(hstIsGoodHel.get());
m_goodHelsCalculated = true;
}
gProc::sigmaKin<<<gpublocks, gputhreads>>>(devMomentaC.get(), devMEs.get());
checkCuda(
cudaMemcpy(mes, devMEs.get(), m_evt * sizeof(T), cudaMemcpyDeviceToHost));
}
#else
template <typename T> void Bridge<T>::cpu_sequence(T *momenta, double *mes) {
// should become class members...
typedef std::unique_ptr<T[], CppHstDeleter<T>> CpTHPtr;
typedef std::unique_ptr<bool[], CppHstDeleter<bool>> CpBHPtr;
CpTHPtr hstMomenta = hstMakeUnique<T>(m_evt * m_part * m_mome);
CpBHPtr hstIsGoodHel2 = hstMakeUnique<bool>(m_ncomb);
CpTHPtr hstMEs = hstMakeUnique<T>(m_evt);
// double(&hstMEs2)[m_evt] = reinterpret_cast<double(&)[m_evt]>(mes);
hst_transpose(momenta, hstMomenta.get(), m_evt, m_part, m_mome, m_strd);
if (!m_goodHelsCalculated) {
Proc::sigmaKin_getGoodHel(hstMomenta.get(), hstMEs.get(),
hstIsGoodHel2.get(), m_evt);
Proc::sigmaKin_setGoodHel(hstIsGoodHel2.get());
m_goodHelsCalculated = true;
}
Proc::sigmaKin(hstMomenta.get(), hstMEs.get(), m_evt);
memcpy(mes, hstMEs.get(), sizeof(T) * m_evt);
}
#endif // __CUDACC__
// *****************************************************************************
//
// Implementations of transposition functions
//
#ifdef __CUDACC__
/**
const int evnt_n = 4; // the number of events
const int part_n = 4; // number of in/out particles inside an event
const int mome_n = 3; // number of momenta of one particle (usually 4)
const int strd_n = 2; // stride length for aosoa data (# adjacent events)
const int array_bytes = evnt_n * part_n * mome_n * sizeof(T);
*/
template <typename T>
__global__ void dev_transpose(const T *in, T *out, const int evt,
const int part, const int mome, const int strd) {
int pos = blockDim.x * blockIdx.x + threadIdx.x;
int arrlen = evt * part * mome;
if (pos < arrlen) {
int page_i = pos / (strd * mome * part);
int rest_1 = pos % (strd * mome * part);
int part_i = rest_1 / (strd * mome);
int rest_2 = rest_1 % (strd * mome);
int mome_i = rest_2 / strd;
int strd_i = rest_2 % strd;
int inpos = (page_i * strd + strd_i) // event number
* (part * mome) // event size (pos of event)
+ part_i * mome // particle inside event
+ mome_i; // momentum inside particle
out[pos] = in[inpos];
}
}
#else
template <typename T>
void hst_transpose(const T *in, T *out, const int evt, const int part,
const int mome, const int strd) {
int arrlen = evt * part * mome;
for (int pos = 0; pos < arrlen; ++pos) {
int page_i = pos / (strd * mome * part);
int rest_1 = pos % (strd * mome * part);
int part_i = rest_1 / (strd * mome);
int rest_2 = rest_1 % (strd * mome);
int mome_i = rest_2 / strd;
int strd_i = rest_2 % strd;
int inpos = (page_i * strd + strd_i) // event number
* (part * mome) // event size (pos of event)
+ part_i * mome // particle inside event
+ mome_i; // momentum inside particle
out[pos] = in[inpos];
}
}
#endif // __CUDACC__
// *****************************************************************************
//
// BACKUP
//
// debug
// std::cout << std::string(80, '*') << std::endl << "Momenta:" << std::endl;
// T *aosoa_p = (T *)hstMomenta.get();
// for (int i = 0; i < m_evt * m_part * m_mome; ++i) {
// if (i && i % m_strd == 0)
// std::cout << ", ";
// if (i && i % (m_mome * m_strd) == 0)
// std::cout << std::endl;
// if (i && i % (m_part * m_mome * m_strd) == 0)
// std::cout << std::endl;
// std::cout << aosoa_p[i] << " ";
// }
// std::cout << std::endl << std::string(80, '*') << std::endl;
// template <typename T> void Matrix<T>::fill(T *arr) {
//
// T(*aos)
// [m_part][m_mome] = (T(*)[m_part][m_mome])arr; // was ->
// m_hstInpArray.get();
//
// for (int i = 0; i < m_evt; ++i) {
// for (int j = 0; j < m_part; ++j) {
// for (int k = 0; k < m_mome; ++k) {
// aos[i][j][k] = (i + 1) * 100 + (j + 1) * 10 + (k + 1);
// }
// }
// }
//
// #ifdef DEBUG
// std::cout << std::string(80, '*') << std::endl;
// T *aos_p = (T *)arr; // was -> m_hstInpArray.get();
// for (int i = 0; i < m_evt * m_part * m_mome; ++i) {
// if (i && i % m_mome == 0)
// std::cout << std::endl;
// if (i && i % (m_mome * m_part) == 0)
// std::cout << std::endl;
// std::cout << aos_p[i] << " ";
// }
// std::cout << std::endl;
// #endif // DEBUG
// }
#endif // BRIDGE_H