From ce008814f370200bc3c0d24cb22b903f6320ddbf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Apr 2021 19:13:52 +0200 Subject: [PATCH] [klas3] add perf stat counters for a better analysis of AVX512 (issue #173) On itscrd03.cern.ch: ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221] FP precision = DOUBLE (NaN/abnormal=0, zero=0) EvtsPerSec[MatrixElems] (3) = ( 7.238520e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 0.781936 sec 2,522,542,287 cycles # 2.658 GHz 3,487,190,786 instructions # 1.38 insn per cycle 1.070591449 seconds time elapsed ==PROF== Profiling "sigmaKin": launch__registers_per_thread 120 ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 1.312061e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 7.151202 sec 19,150,190,925 cycles # 2.676 GHz 48,624,130,145 instructions # 2.54 insn per cycle 7.160649288 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 2.522426e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 4.855343 sec 12,990,550,722 cycles # 2.672 GHz 29,947,264,265 instructions # 2.31 insn per cycle 4.864907320 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 4.558120e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 3.701845 sec 9,362,892,708 cycles # 2.525 GHz 16,560,124,475 instructions # 1.77 insn per cycle 3.711390559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2746) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 4.913520e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 3.570205 sec 9,047,092,255 cycles # 2.529 GHz 16,496,830,998 instructions # 1.82 insn per cycle 3.580131314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2572) (512y: 95) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 3.754219e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 3.983833 sec 8,829,184,043 cycles # 2.213 GHz 13,360,526,672 instructions # 1.51 insn per cycle 3.993458249 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1127) (512y: 205) (512z: 2045) ------------------------------------------------------------------------- --- .../P1_Sigma_sm_epem_mupmum/throughput12.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh index fa6f4d3e8a..9757f0fdb4 100755 --- a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh +++ b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh @@ -21,8 +21,8 @@ while [ "$1" != "" ]; do done exes= -exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/check.exe" exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/gcheck.exe" +exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.none/check.exe" if [ "${avxall}" == "1" ]; then exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.sse4/check.exe" exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.avx2/check.exe" @@ -32,8 +32,8 @@ if [ "${avxall}" == "1" ]; then exes="$exes ../../../../../epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/build.512z/check.exe" fi if [ "${ep2}" == "1" ]; then - exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.exe" exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/gcheck.exe" + exes="$exes ../../../../../epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.exe" fi export USEBUILDDIR=1 @@ -58,8 +58,13 @@ function runExe() { # Optionally add other patterns here for some specific configurations (e.g. clang) pattern="${pattern}|CUCOMPLEX" pattern="${pattern}|COMMON RANDOM" + # -- Older version using time # For TIMEFORMAT see https://www.gnu.org/software/bash/manual/html_node/Bash-Variables.html - TIMEFORMAT=$'real\t%3lR' && time $exe -p 2048 256 12 2>&1 | egrep "(${pattern})" + ###TIMEFORMAT=$'real\t%3lR' && time $exe -p 2048 256 12 2>&1 | egrep "(${pattern})" + # -- Newer version using perf stat + pattern="${pattern}|instructions|cycles" + pattern="${pattern}|elapsed" + perf stat $exe -p 2048 256 12 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" } function runNcu() {