Skip to content

Commit

Permalink
Revert "[klas3] use 512y as default AVX also on clang (issue madgraph…
Browse files Browse the repository at this point in the history
…5#182)"

This reverts commit 50c12e5.

Keep AVX2 on clang for the moment as this is actually faster than gcc!

Latest baseline performance on gcc with cxtype_ref (issue madgraph5#172):

On itscrd70.cern.ch:
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 1.306401e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     7.185372 sec
real    0m7.195s
=Symbols in CPPProcess.o= (~sse4:  614) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 7.265629e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.742860 sec
real    0m1.035s
==PROF== Profiling "sigmaKin": launch__registers_per_thread 120
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 2.504491e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     4.867292 sec
real    0m4.878s
=Symbols in CPPProcess.o= (~sse4: 3274) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.592850e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.656376 sec
real    0m3.666s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2746) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 4.916156e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     3.581543 sec
real    0m3.591s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 2572) (512y:   95) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv    = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 3.705860e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     4.008144 sec
real    0m4.018s
=Symbols in CPPProcess.o= (~sse4:    0) (avx2: 1127) (512y:  205) (512z: 2045)
-------------------------------------------------------------------------
Process                     = EPOCH2_EEMUMU_CPP [gcc (GCC) 9.2.0]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
OMP threads / `nproc --all` = 1 / 4
EvtsPerSec[MatrixElems] (3) = ( 1.147166e+06                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     7.840836 sec
real    0m7.850s
=Symbols in CPPProcess.o= (~sse4:  567) (avx2:    0) (512y:    0) (512z:    0)
-------------------------------------------------------------------------
Process                     = EPOCH2_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 7.417572e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.735049 sec
real    0m1.027s
==PROF== Profiling "sigmaKin": launch__registers_per_thread 164
-------------------------------------------------------------------------
  • Loading branch information
valassi committed Apr 27, 2021
1 parent 50c12e5 commit fbebc25
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
10 changes: 7 additions & 3 deletions epoch1/cuda/ee_mumu/SubProcesses/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,16 @@ CXX ?= g++
# AVX choice (example: "make AVX=none")
# Picks a default value for AVX when none was supplied:
# - a non-empty AVX (e.g. given on the make command line) is left unchanged;
# - '512y' is selected when /proc/cpuinfo mentions avx512vl and `$(CXX) --version`
#   prints no line starting with "clang";
# - 'avx2' is selected in every other case, with a warning that states the reason.
# NOTE(review): this span is a rendered diff hunk; the two consecutive 'else ifeq'
# lines below look like the pre-change condition followed by its replacement,
# so only one of them is expected in the real Makefile - confirm against the commit.
ifneq ($(AVX),)
###$(info Using AVX='$(AVX)' according to user input)
else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
# `grep -m1 -c` prints 1 if at least one line matches (it stops at the first match) and 0 otherwise.
# The condition below appends any "clang..." line of `$(CXX) --version` to that count,
# so the comparison with "1" only succeeds when avx512vl is found AND no such line is printed.
else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1)
override AVX = 512y
###$(info Using AVX='$(AVX)' as no user input exists)
else
# Fallback: either the host lacks avx512vl or the compiler reports itself as clang.
override AVX = avx2
# Repeat the CPU check alone to choose which of the two explanations to print.
ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
$(warning Using AVX='$(AVX)' because host does not support avx512vl)
else
$(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang)
endif
endif
###$(info AVX=$(AVX))

Expand All @@ -29,9 +33,9 @@ endif
ifeq ($(AVX),sse4)
override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers)
else ifeq ($(AVX),avx2)
override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers)
override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
else ifeq ($(AVX),512y)
override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT]
override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
else ifeq ($(AVX),512z)
override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
else ifneq ($(AVX),none)
Expand Down
10 changes: 7 additions & 3 deletions epoch1/cuda/ee_mumu/src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@ CXX ?= g++
# AVX choice (example: "make AVX=none")
# Picks a default value for AVX when none was supplied:
# - a non-empty AVX (e.g. given on the make command line) is left unchanged;
# - '512y' is selected when /proc/cpuinfo mentions avx512vl and `$(CXX) --version`
#   prints no line starting with "clang";
# - 'avx2' is selected in every other case, with a warning that states the reason.
# NOTE(review): this span is a rendered diff hunk; the two consecutive 'else ifeq'
# lines below look like the pre-change condition followed by its replacement,
# so only one of them is expected in the real Makefile - confirm against the commit.
ifneq ($(AVX),)
###$(info Using AVX='$(AVX)' according to user input)
else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
# `grep -m1 -c` prints 1 if at least one line matches (it stops at the first match) and 0 otherwise.
# The condition below appends any "clang..." line of `$(CXX) --version` to that count,
# so the comparison with "1" only succeeds when avx512vl is found AND no such line is printed.
else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1)
override AVX = 512y
###$(info Using AVX='$(AVX)' as no user input exists)
else
# Fallback: either the host lacks avx512vl or the compiler reports itself as clang.
override AVX = avx2
# Repeat the CPU check alone to choose which of the two explanations to print.
ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
$(warning Using AVX='$(AVX)' because host does not support avx512vl)
else
$(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang)
endif
endif
###$(info AVX=$(AVX))

Expand All @@ -25,9 +29,9 @@ endif
ifeq ($(AVX),sse4)
override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers)
else ifeq ($(AVX),avx2)
override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers)
override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
else ifeq ($(AVX),512y)
override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT]
override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
else ifeq ($(AVX),512z)
override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
else ifneq ($(AVX),none)
Expand Down

0 comments on commit fbebc25

Please sign in to comment.