Revert "[klas3] use 512y as default AVX also on clang (issue madgraph…

…5#182)" This reverts commit 50c12e5. Keep AVX2 on clang for the moment as this is actually faster than gcc! Latest baseline performance on gcc with cxtype_ref (issue madgraph5#172): On itscrd70.cern.ch: ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 1.306401e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 7.185372 sec real 0m7.195s =Symbols in CPPProcess.o= (~sse4: 614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221] FP precision = DOUBLE (NaN/abnormal=0, zero=0) EvtsPerSec[MatrixElems] (3) = ( 7.265629e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 0.742860 sec real 0m1.035s ==PROF== Profiling "sigmaKin": launch__registers_per_thread 120 ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 2.504491e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 4.867292 sec real 0m4.878s =Symbols in CPPProcess.o= (~sse4: 3274) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 4.592850e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 3.656376 sec real 
0m3.666s =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2746) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 4.916156e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 3.581543 sec real 0m3.591s =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2572) (512y: 95) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH1_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 3.705860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 4.008144 sec real 0m4.018s =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1127) (512y: 205) (512z: 2045) ------------------------------------------------------------------------- Process = EPOCH2_EEMUMU_CPP [gcc (GCC) 9.2.0] FP precision = DOUBLE (NaN/abnormal=0, zero=0) OMP threads / `nproc --all` = 1 / 4 EvtsPerSec[MatrixElems] (3) = ( 1.147166e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 7.840836 sec real 0m7.850s =Symbols in CPPProcess.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- Process = EPOCH2_EEMUMU_CUDA [nvcc 11.0.221] FP precision = DOUBLE (NaN/abnormal=0, zero=0) EvtsPerSec[MatrixElems] (3) = ( 7.417572e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 0.735049 sec real 0m1.027s ==PROF== Profiling "sigmaKin": launch__registers_per_thread 164 -------------------------------------------------------------------------
valassi · Apr 27, 2021 · fbebc25 · fbebc25
1 parent 50c12e5
commit fbebc25
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 6 deletions.
diff --git a/epoch1/cuda/ee_mumu/SubProcesses/Makefile b/epoch1/cuda/ee_mumu/SubProcesses/Makefile
@@ -14,12 +14,16 @@ CXX     ?= g++
 # AVX choice (example: "make AVX=none")
 ifneq ($(AVX),)
 ###$(info Using AVX='$(AVX)' according to user input)
-else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
+else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1)
 override AVX = 512y
 ###$(info Using AVX='$(AVX)' as no user input exists)
 else
 override AVX = avx2
+ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
 $(warning Using AVX='$(AVX)' because host does not support avx512vl)
+else
+$(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang)
+endif
 endif
 ###$(info AVX=$(AVX))
 
@@ -29,9 +33,9 @@ endif
 ifeq ($(AVX),sse4)
 override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers)
 else ifeq ($(AVX),avx2)
-override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers)
+override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
 else ifeq ($(AVX),512y)
-override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT]
+override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
 else ifeq ($(AVX),512z)
 override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
 else ifneq ($(AVX),none)

diff --git a/epoch1/cuda/ee_mumu/src/Makefile b/epoch1/cuda/ee_mumu/src/Makefile
@@ -10,12 +10,16 @@ CXX     ?= g++
 # AVX choice (example: "make AVX=none")
 ifneq ($(AVX),)
 ###$(info Using AVX='$(AVX)' according to user input)
-else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
+else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1)
 override AVX = 512y
 ###$(info Using AVX='$(AVX)' as no user input exists)
 else
 override AVX = avx2
+ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1)
 $(warning Using AVX='$(AVX)' because host does not support avx512vl)
+else
+$(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang)
+endif
 endif
 ###$(info AVX=$(AVX))
 
@@ -25,9 +29,9 @@ endif
 ifeq ($(AVX),sse4)
 override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers)
 else ifeq ($(AVX),avx2)
-override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers)
+override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang]
 else ifeq ($(AVX),512y)
-override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT]
+override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
 else ifeq ($(AVX),512z)
 override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
 else ifneq ($(AVX),none)