Skip to content

Commit

Permalink
[klas3] add perf stat counters for a better analysis of AVX512 (issue madgraph5#173)
Browse files Browse the repository at this point in the history

On itscrd03.cern.ch:
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 7.238520e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.781936 sec
     2,522,542,287      cycles                    #    2.658 GHz
     3,487,190,786      instructions              #    1.38  insn per cycle
       1.070591449 seconds time elapsed
==PROF== Profiling "sigmaKin": launch__registers_per_thread 120
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 1.312061e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     7.151202 sec
    19,150,190,925      cycles                    #    2.676 GHz
    48,624,130,145      instructions              #    2.54  insn per cycle
       7.160649288 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:  614) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 2.522426e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     4.855343 sec
    12,990,550,722      cycles                    #    2.672 GHz
    29,947,264,265      instructions              #    2.31  insn per cycle
       4.864907320 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 3274) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.558120e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.701845 sec
     9,362,892,708      cycles                    #    2.525 GHz
    16,560,124,475      instructions              #    1.77  insn per cycle
       3.711390559 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2746) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.913520e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.570205 sec
     9,047,092,255      cycles                    #    2.529 GHz
    16,496,830,998      instructions              #    1.82  insn per cycle
       3.580131314 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2572) (512y:   95) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 3.754219e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.983833 sec
     8,829,184,043      cycles                    #    2.213 GHz
    13,360,526,672      instructions              #    1.51  insn per cycle
       3.993458249 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 1127) (512y:  205) (512z: 2045)
-------------------------------------------------------------------------
  • Loading branch information
valassi committed Apr 27, 2021
1 parent 39f4ac3 commit ce00881
Showing 1 changed file with 8 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ while [ "$1" != "" ]; do
done

exes=
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/check.exe"
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/gcheck.exe"
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/check.exe"
if [ "${avxall}" == "1" ]; then
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.sse4/check.exe"
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.avx2/check.exe"
Expand All @@ -32,8 +32,8 @@ if [ "${avxall}" == "1" ]; then
exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.512z/check.exe"
fi
if [ "${ep2}" == "1" ]; then
exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.exe"
exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/gcheck.exe"
exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.exe"
fi

export USEBUILDDIR=1
Expand All @@ -58,8 +58,13 @@ function runExe() {
# Optionally add other patterns here for some specific configurations (e.g. clang)
pattern="${pattern}|CUCOMPLEX"
pattern="${pattern}|COMMON RANDOM"
# -- Older version using time
# For TIMEFORMAT see https://www.gnu.org/software/bash/manual/html_node/Bash-Variables.html
TIMEFORMAT=$'real\t%3lR' && time $exe -p 2048 256 12 2>&1 | egrep "(${pattern})"
###TIMEFORMAT=$'real\t%3lR' && time $exe -p 2048 256 12 2>&1 | egrep "(${pattern})"
# -- Newer version using perf stat
pattern="${pattern}|instructions|cycles"
pattern="${pattern}|elapsed"
perf stat $exe -p 2048 256 12 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats"
}

function runNcu() {
Expand Down

0 comments on commit ce00881

Please sign in to comment.