Skip to content

Commit

Permalink
Merge pull request #209 from valassi/tput
Browse files Browse the repository at this point in the history
Improve throughput script + Results of further AOSOA tests
  • Loading branch information
valassi authored Jun 11, 2021
2 parents cec5045 + ebf8c6b commit 8a7c494
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ namespace Proc
{
#ifdef __CUDACC__
#ifndef MGONGPU_TEST_DIVERGENCE
// NB: opzxxx only reads pz (not E,px,py)
opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 );
//oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower)
#else
Expand All @@ -127,6 +128,7 @@ namespace Proc
#endif

#ifdef __CUDACC__
// NB: imzxxx only reads pz (not E,px,py)
imzxxx( allmomenta, cHel[ihel][1], +1, w_sv[1], 1 );
//ixxxxx( allmomenta, 0, cHel[ihel][1], +1, w_sv[1], 1 ); // tested ok (slower)
#else
Expand All @@ -135,6 +137,7 @@ namespace Proc
#endif

#ifdef __CUDACC__
// NB: ixzxxx reads all E,px,py,pz
ixzxxx( allmomenta, cHel[ihel][2], -1, w_sv[2], 2 );
//ixxxxx( allmomenta, 0, cHel[ihel][2], -1, w_sv[2], 2 ); // tested ok (a bit slower)
#else
Expand All @@ -143,6 +146,7 @@ namespace Proc
#endif

#ifdef __CUDACC__
// NB: oxzxxx reads all E,px,py,pz
oxzxxx( allmomenta, cHel[ihel][3], +1, w_sv[3], 3 );
//oxxxxx( allmomenta, 0, cHel[ihel][3], +1, w_sv[3], 3 ); // tested ok (a bit slower)
#else
Expand Down Expand Up @@ -466,6 +470,7 @@ namespace Proc
#else
calculate_wavefunctions( ihel, allmomenta, allMEs, nevt );
#endif
//if ( ighel == 0 ) break; // TEST sectors/requests (issue #16)
}

// PART 2 - FINALISATION (after calculate_wavefunctions)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ cpp=1
ab3=0
ggttgg=0
div=0
req=0
detailed=0
verbose=0

# Prints the command-line synopsis of this script and terminates it with exit status 1.
# NOTE(review): two synopsis lines are echoed below; the first one lacks the
# [-req] and [-detailed] options and looks like the pre-change version of the
# second one - confirm whether only the second line is meant to remain.
function usage()
{
echo "Usage: $0 [-nocpp|[-omp][-avxall]] [-ep2] [-3a3b] [-ggttgg] [-div] [-v]"
echo "Usage: $0 [-nocpp|[-omp][-avxall]] [-ep2] [-3a3b] [-ggttgg] [-div] [-req] [-detailed] [-v]"
exit 1
}

Expand Down Expand Up @@ -43,6 +45,12 @@ while [ "$1" != "" ]; do
elif [ "$1" == "-div" ]; then
div=1
shift
elif [ "$1" == "-req" ]; then
req=1
shift
elif [ "$1" == "-detailed" ]; then
detailed=1
shift
elif [ "$1" == "-v" ]; then
verbose=1
shift
Expand Down Expand Up @@ -132,13 +140,16 @@ function runExe() {
if [ "${exe%%/gcheck*}" != "${exe}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi
pattern="${pattern}|CUCOMPLEX"
pattern="${pattern}|COMMON RANDOM"
pattern="${pattern}|ERROR"
if [ "${ab3}" == "1" ]; then pattern="${pattern}|3a|3b"; fi
if [ "${req}" == "1" ]; then pattern="${pattern}|memory layout"; fi
if perf --version >& /dev/null; then
# -- Newer version using perf stat
pattern="${pattern}|instructions|cycles"
pattern="${pattern}|elapsed"
if [ "${detailed}" == "1" ]; then pattern="${pattern}|#"; fi
if [ "${verbose}" == "1" ]; then set -x; fi
perf stat $exe $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats"
perf stat -d $exe $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats"
set +x
else
# -- Older version using time
Expand All @@ -153,7 +164,7 @@ function runExe() {
function runNcu() {
exe=$1
args="$2"
###echo "runNcu $exe $args OMP=$OMP_NUM_THREADS"
###echo "runNcu $exe $args"
if [ "${verbose}" == "1" ]; then set -x; fi
$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}'
set +x
Expand All @@ -166,7 +177,7 @@ function runNcu() {
function runNcuDiv() {
exe=$1
args="-p 1 32 1"
###echo "runNcuDiv $exe $args OMP=$OMP_NUM_THREADS"
###echo "runNcuDiv $exe $args"
if [ "${verbose}" == "1" ]; then set -x; fi
###$(which ncu) --query-metrics $exe $args
###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args
Expand All @@ -176,10 +187,27 @@ function runNcuDiv() {
set +x
}

# Profiles sectors and requests of the "sigmaKin" kernel with ncu
# (metrics l1tex__t_sectors/requests_pipe_lsu_mem_global_op_ld.sum, plus
# registers per thread and branch-target uniformity).
# Arguments:
#   $1 - executable to profile
#   $2 - argument string for the executable, profiled last, after four
#        fixed "-p ..." configurations
# Output: for each configuration, the selected fields of the filtered ncu
# output on one line, followed by a "[<args>]" tag.
# NB: exe, ncuArgs and args are not declared local, so they are assigned in
# the caller's shell scope.
function runNcuReq() {
exe=$1
ncuArgs="$2"
# Trace the commands below if -v was given (tracing is switched off again at the end)
if [ "${verbose}" == "1" ]; then set -x; fi
# Four fixed "-p" configurations first, then the one supplied by the caller
for args in "-p 1 1 1" "-p 1 4 1" "-p 1 8 1" "-p 1 32 1" "$ncuArgs"; do
###echo "runNcuReq $exe $args"
# NB This will print nothing if $args are invalid (eg "-p 1 4 1" when neppR=8)
# Keep only the ncu output lines matching the egrep pattern, join them into a
# single line (tr) and print selected whitespace-separated fields (awk); the
# field positions ($16, $17, $19, $20) depend on the exact ncu output layout
$(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17, $19"s", $20, tag}'
done
set +x
}

# Describe the GPU: if "nvidia-smi -L" succeeds, keep fields 3-5 of its output, otherwise use "none"
if nvidia-smi -L > /dev/null 2>&1; then gpuTxt=$(nvidia-smi -L | awk '{print $3,$4,$5}'); else gpuTxt=none; fi
# Describe the CPU: take the first "model name" line of /proc/cpuinfo and keep the text from "Intel" up to (excluding) " @"
# NOTE(review): this extraction presumably targets Intel model names of the form "Intel... @ <freq>" - confirm the result on other CPUs
cpuTxt=$(cat /proc/cpuinfo | grep '^model name' | head -1 | awk '{i0=index($0,"Intel"); i1=index($0," @"); print substr($0,i0,i1-i0)}')
# Print a header line with the host name and the CPU and GPU descriptions
echo -e "\nOn $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:"

lastExe=
echo -e "\nOn $HOSTNAME ($(nvidia-smi -L | awk '{print $5}')):"
for exe in $exes; do
if [ ! -f $exe ]; then continue; fi
if [ "${exe%%/gcheck*}" != "${exe}" ] && [ "$gpuTxt" == "none" ]; then continue; fi
if [ "${exe%%/gg_ttgg*}" != "${exe}" ]; then
# This is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1"
exeArgs="-p 64 256 1"
Expand All @@ -205,9 +233,8 @@ for exe in $exes; do
fi
elif [ "${exe%%/gcheck*}" != "${exe}" ]; then
runNcu $exe "$ncuArgs"
if [ "${div}" == "1" ]; then
runNcuDiv $exe
fi
if [ "${div}" == "1" ]; then runNcuDiv $exe; fi
if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi
fi
done
echo "========================================================================="
1 change: 1 addition & 0 deletions epoch1/cuda/ee_mumu/src/mgOnGpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ namespace mgOnGpu
// *** NB Different values of neppR lead to different physics results: the ***
// *** same 1d array is generated, but it is interpreted in different ways ***
const int neppR = 8; // HARDCODED TO GIVE ALWAYS THE SAME PHYSICS RESULTS!
//const int neppR = 1; // AOS (tests of sectors/requests)

}

Expand Down

0 comments on commit 8a7c494

Please sign in to comment.