Skip to content

Commit

Permalink
[klas3] vectorize the final calculations in sigmakin (issue #178)
Browse files Browse the repository at this point in the history
This squeezes one (last?) % of performance. Now 4.91E6, was 4.87E6.

On itscrd70.cern.ch:
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 1.307604e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     7.176768 sec
real    0m7.186s
=Symbols in CPPProcess.o= (~sse4:  614) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 7.341151e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.731415 sec
real    0m1.031s
==PROF== Profiling "sigmaKin": launch__registers_per_thread 120
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 2.500771e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     4.879965 sec
real    0m4.890s
=Symbols in CPPProcess.o= (~sse4: 3274) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.600406e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.653161 sec
real    0m3.663s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2746) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.909408e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.581065 sec
real    0m3.591s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2572) (512y:   95) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 3.736380e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.995692 sec
real    0m4.006s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 1127) (512y:  205) (512z: 2045)
-------------------------------------------------------------------------
Process                     = EPOCH2_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 1.145752e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     7.854923 sec
real    0m7.865s
=Symbols in CPPProcess.o= (~sse4:  567) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH2_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 7.451168e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.734622 sec
real    0m1.027s
==PROF== Profiling "sigmaKin": launch__registers_per_thread 164
-------------------------------------------------------------------------
  • Loading branch information
valassi committed Apr 27, 2021
1 parent 1659aa7 commit da19d3c
Showing 1 changed file with 21 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,22 @@ namespace Proc
#endif

// PART 0 - INITIALISATION (before calculate_wavefunctions)
// Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
#ifdef MGONGPU_CPPSIMD
const int npagV = nevt/neppV;
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
{
allMEs[ipagV] = fptype_v{0}; // all zeros
}
#else
#ifndef __CUDACC__
for ( int ievt = 0; ievt < nevt; ++ievt )
#endif
{
// Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
#ifndef MGONGPU_CPPSIMD
allMEs[ievt] = 0; // all zeros
#else
allMEs[ievt/neppV][ievt%neppV] = 0; // all zeros
#endif
}
#endif

// PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS
// (in both CUDA and C++, using precomputed good helicities)
Expand All @@ -450,20 +454,23 @@ namespace Proc
}

// PART 2 - FINALISATION (after calculate_wavefunctions)
// Get the final |M|^2 as an average over helicities/colors of running sum of |M|^2 over helicities for the given event
// [NB 'sum over final spins, average over initial spins', eg see
// https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
#ifdef MGONGPU_CPPSIMD
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
{
allMEs[ipagV] /= denominators;
}
#else
#ifndef __CUDACC__
for ( int ievt = 0; ievt < nevt; ++ievt )
#endif
{
// Get the final |M|^2 as an average over helicities/colors of running sum of |M|^2 over helicities for the given event
// [NB 'sum over final spins, average over initial spins', eg see
// https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf]
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
#ifndef MGONGPU_CPPSIMD
allMEs[ievt] /= denominators;
#else
allMEs[ievt/neppV][ievt%neppV] /= denominators;
#endif
}
#endif
mgDebugFinalise();
}

Expand Down

0 comments on commit da19d3c

Please sign in to comment.