From cacd48d06ded758bb449613b8e91ed716db7e034 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 09:32:43 +0100 Subject: [PATCH 01/42] [omp] in gg_tt.mad reenable OpenMP MT in cudacpp #575 --- .../gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc | 7 +++---- .../cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc | 1 + epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 07d46022a9..f92c5bfc3e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -909,16 +909,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) From 917209161b20ea0177166676ad3ffb902c3951c6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 09:40:34 +0100 Subject: [PATCH 02/42] [omp] in CODEGEN backport reenabling of OMP MT #575 --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 1 + .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 9 +++++---- .../template_files/gpu/process_sigmaKin_function.inc | 7 +++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 17bfe9587a..3c5f0f2c23 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 65c9b66945..3818b0db04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -96,16 +96,15 @@ #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) From 912020d7d36c0b79b820f74efd9cb3ae4600767c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 09:41:45 +0100 Subject: [PATCH 03/42] [omp] regenerate ggtt mad - all stable --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index de741c70c2..323a382e71 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,6 +2,7 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 1.538149356842041) ************************************************************ * * * W E L C O M E to * @@ -57,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006824970245361328  +DEBUG: model prefixing takes 0.006787538528442383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +170,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -204,8 +205,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.133 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s +Wrote files for 10 helas calls in 0.162 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines @@ -214,7 +215,7 @@ ALOHA: aloha creates 2 routines in 0.175 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.159 s +ALOHA: aloha creates 4 routines in 0.244 s VVV1 FFV1 FFV1 @@ -238,6 +239,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.259s -user 0m1.938s -sys 0m0.294s +real 0m7.626s +user 0m1.972s +sys 0m0.333s From 805544481a617d1b624595a3a486aaf08f87f50c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 09:50:48 +0100 Subject: [PATCH 04/42] [omp] regenerate the other 4 processes mad --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 14 ++++++------- .../SubProcesses/P1_ll_ll/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_ll_ll/check_sa.cc | 1 + .../ee_mumu.mad/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 16 +++++++-------- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_gg_ttxg/check_sa.cc | 1 + .../gg_ttg.mad/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 18 ++++++++--------- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 1 + .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 +++++++++---------- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 1 + .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 9 +++++---- 16 files changed, 70 insertions(+), 66 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index e17039aa4e..a9e90bce22 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068399906158447266  +DEBUG: model prefixing takes 0.006835222244262695  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -168,7 +168,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_ll_ll -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -206,14 +206,14 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.240 s +ALOHA: aloha creates 3 routines in 0.267 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.305 s +ALOHA: aloha creates 7 routines in 0.323 s FFV1 FFV1 FFV2 @@ -241,6 +241,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.425s -user 0m2.110s -sys 0m0.293s +real 0m2.496s +user 0m2.127s +sys 0m0.289s diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc index db5d51ec93..702a9a5a97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc @@ -901,16 +901,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index afd9c1ba7d..8dfa6e6ea2 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006835460662841797  +DEBUG: model prefixing takes 0.00680088996887207  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -206,15 +206,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.049 s -Wrote files for 36 helas calls in 0.202 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.048 s +Wrote files for 36 helas calls in 0.203 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.391 s +ALOHA: aloha creates 5 routines in 0.446 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -222,7 +222,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.373 s +ALOHA: aloha creates 10 routines in 0.380 s VVV1 VVV1 FFV1 @@ -251,6 +251,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.901s +real 0m2.976s user 0m2.579s -sys 0m0.300s +sys 0m0.313s diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index aec1546376..201b78594d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1130,16 +1130,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8f7775d3a3..321d5a9829 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068187713623046875  +DEBUG: model prefixing takes 0.006788969039916992  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -208,15 +208,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.548 s -Wrote files for 222 helas calls in 0.939 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.546 s +Wrote files for 222 helas calls in 0.928 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.399 s +ALOHA: aloha creates 5 routines in 0.397 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -224,7 +224,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.382 s +ALOHA: aloha creates 10 routines in 0.377 s VVV1 VVV1 FFV1 @@ -256,6 +256,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m4.515s -user 0m4.159s -sys 0m0.331s +real 0m4.506s +user 0m4.192s +sys 0m0.292s diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index b71abdc468..24a304017f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -3076,16 +3076,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index a11e9984a3..5631ab8bd7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068476200103759766  +DEBUG: model prefixing takes 0.0068318843841552734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.401 s +1 processes with 1240 diagrams generated in 2.396 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -171,7 +171,7 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1592 term in 40s. Introduce 2768 contraction -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -212,15 +212,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.322 s -Wrote files for 2281 helas calls in 53.571 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.326 s +Wrote files for 2281 helas calls in 53.171 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.387 s +ALOHA: aloha creates 5 routines in 0.455 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -228,7 +228,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.375 s +ALOHA: aloha creates 10 routines in 0.379 s VVV1 VVV1 FFV1 @@ -260,6 +260,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 1m9.537s -user 1m8.143s -sys 0m1.359s +real 1m9.234s +user 1m8.004s +sys 0m1.096s diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 20611bb7f9..8b76866ca9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30743,16 +30743,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) From 3348c616153949c94372fb0776d9d9047532c39c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 09:52:16 +0100 Subject: [PATCH 05/42] [omp] regenerate all 6 processes sa --- .../ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt | 10 +++++----- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 7 +++---- .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 1 + epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk | 9 +++++---- .../cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 +++++----- .../SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 10 +++++----- .../SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc | 1 + epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt | 14 +++++++------- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc | 1 + epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk | 9 +++++---- .../gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt | 14 +++++++------- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc | 1 + epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk | 9 +++++---- .../heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt | 6 +++--- .../SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc | 7 +++---- .../SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc | 1 + .../cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk | 9 +++++---- 24 files changed, 86 insertions(+), 80 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 705b265cde..9f9afb2c69 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006850242614746094  +DEBUG: model prefixing takes 0.006852865219116211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -194,7 +194,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.322 s +ALOHA: aloha creates 4 routines in 0.323 s FFV1 FFV1 FFV2 @@ -212,6 +212,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.903s -user 0m0.794s -sys 0m0.083s +real 0m0.929s +user 0m0.825s +sys 0m0.071s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index e5a470b2f9..5482b4d1b8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -899,16 +899,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index d1cf4906b3..9cb79c9c9f 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006814241409301758  +DEBUG: model prefixing takes 0.006863117218017578  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -197,7 +197,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.173 s +ALOHA: aloha creates 2 routines in 0.176 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.767s -user 0m0.665s -sys 0m0.078s +real 0m0.789s +user 0m0.677s +sys 0m0.086s diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index c6d351e590..34f6c1d6b8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -906,16 +906,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 452700e1a2..a364a7a536 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006808757781982422  +DEBUG: model prefixing takes 0.00687408447265625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.390 s +ALOHA: aloha creates 5 routines in 0.389 s VVV1 VVV1 FFV1 @@ -221,6 +221,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m1.077s -user 0m0.976s -sys 0m0.076s +real 0m1.081s +user 0m0.971s +sys 0m0.086s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 24e897c059..b5cfebcd63 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1124,16 +1124,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 3b3202c5ba..0b56edfa6f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068476200103759766  +DEBUG: model prefixing takes 0.006833553314208984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.201 s +1 processes with 123 diagrams generated in 0.202 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -196,7 +196,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.545 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.540 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -204,7 +204,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.395 s +ALOHA: aloha creates 5 routines in 0.385 s VVV1 VVV1 FFV1 @@ -226,6 +226,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m1.945s -user 0m1.828s -sys 0m0.083s +real 0m1.936s +user 0m1.814s +sys 0m0.090s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index d14c306fd2..8897055998 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -3133,16 +3133,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 08d10a400b..59e708bc12 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006876230239868164  +DEBUG: model prefixing takes 0.006831169128417969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.395 s +1 processes with 1240 diagrams generated in 2.364 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -198,7 +198,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.298 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.304 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.416 s +ALOHA: aloha creates 5 routines in 0.417 s VVV1 VVV1 FFV1 @@ -228,6 +228,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m16.318s -user 0m16.141s -sys 0m0.147s +real 0m16.307s +user 0m16.137s +sys 0m0.145s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 79e72c5e6a..258db7eba4 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -32633,16 +32633,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 0b389979f4..d31fa64b1d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -179,6 +179,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.736s -user 0m0.542s -sys 0m0.068s +real 0m0.749s +user 0m0.541s +sys 0m0.079s diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 38d5cdc8f7..e009930829 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -860,16 +860,15 @@ namespace mg5amcCpu #else const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time #endif - /* #ifdef _OPENMP - // (NB gcc9 or higher, or clang, is required) + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ // - default(none): no variables are shared by default // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside -#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators ) +#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) #endif // _OPENMP - */ for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { // Running sum of partial amplitudes squared for event by event color selection (#402) diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 16eba1a383..f91ee8ebfb 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -12,6 +12,7 @@ #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" #include "epoch_process_id.h" +#include "ompnumthreads.h" #include "timermap.h" #include diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 2ecaec2b5a..14809ca31d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -170,10 +170,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +###override OMPFLAGS = # disable OpenMP MT (default before #575) endif -###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT) -override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT) # Set the default AVX (vectorization) choice ifeq ($(AVX),) @@ -529,7 +530,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) From 60fb04f0ef89bc60625fd5b46c42a2497198a566 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 10:10:47 +0100 Subject: [PATCH 06/42] [omp] in gg_tt.mad fix OMP build for CUDA_HOME=none (add -lgomp to runTest.exe linking) --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) From 6ee189defaa61dd57fc667951778ebfbad583100 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 10:12:22 +0100 Subject: [PATCH 07/42] [omp] in CODEGEN backport -lgomp fix --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 3c5f0f2c23..3086ecb86d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) From 716f0859332edf9236ad9bf6b8b887508a6aa9b4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 10:12:58 +0100 Subject: [PATCH 08/42] [omp] regenerate gg_tt.mad, check all ok --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 323a382e71..a8b49ace05 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 1.538149356842041) ************************************************************ * * * W E L C O M E to * @@ -58,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006787538528442383  +DEBUG: model prefixing takes 0.006829977035522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -205,17 +204,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s -Wrote files for 10 helas calls in 0.162 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Wrote files for 10 helas calls in 0.135 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.175 s +ALOHA: aloha creates 2 routines in 0.185 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.244 s +ALOHA: aloha creates 4 routines in 0.160 s VVV1 FFV1 FFV1 @@ -239,6 +238,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m7.626s -user 0m1.972s -sys 0m0.333s +real 0m2.278s +user 0m1.942s +sys 0m0.295s From bf2a2a53e0ebea92bd67ef7bbef1a174d301f627 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 10:13:32 +0100 Subject: [PATCH 09/42] [omp] manually copy ggt_mad cudacpp.mk to the other 4 mad and to all 6 sa I made a few tests manually, see logs in #575 NB One thing that I have not done is to reenable OMP tests in tmad/tput scripts. You need very large number of events and long tests to get meaningful results --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 14809ca31d..21621aa244 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -602,7 +602,7 @@ endif ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) From 3b886c305b6be10c1a968eb529fbcec02125760f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:03:20 +0100 Subject: [PATCH 10/42] [omp] in ggtt.sa fix OMP #575 when MULTICHANNEL is disabled #568 --- .../gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 34f6c1d6b8..19f353f1cd 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -913,7 +913,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { From 226c482b1b3b90f1ff13fcfd106bfd9917b29e79 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:05:12 +0100 Subject: [PATCH 11/42] [omp] in CODEGEN backport OMP fix with MULTICHANNEL disabled --- .../iolibs/template_files/gpu/process_sigmaKin_function.inc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 3818b0db04..97800c89e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -103,7 +103,11 @@ // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { From 1520997c6bf4c2d2a867ebb9c03fa8e483f3d32a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:06:13 +0100 Subject: [PATCH 12/42] [omp] regenerate ggtt.sa - all is stable --- epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 9cb79c9c9f..b7f1fe3a3e 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006863117218017578  +DEBUG: model prefixing takes 0.006822347640991211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -197,7 +197,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.176 s +ALOHA: aloha creates 2 routines in 0.174 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.789s -user 0m0.677s -sys 0m0.086s +real 0m0.774s +user 0m0.670s +sys 0m0.079s From 80b53a1972b806f93045ad44ce32cba7f77d112a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:10:51 +0100 Subject: [PATCH 13/42] [omp] regenerate all 6 sa and 5 mad - complete reenabling of OMP MT #575 (but clang fails build) --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 14 ++++++------- .../SubProcesses/P1_ll_ll/CPPProcess.cc | 4 ++++ .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 +++++----- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 4 ++++ .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +++++++-------- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 4 ++++ .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 +++++----- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 14 ++++++------- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 4 ++++ .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 10 +++++----- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 4 ++++ .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 18 ++++++++--------- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 4 ++++ .../CODEGEN_cudacpp_gg_ttgg_log.txt | 12 +++++------ .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 4 ++++ .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 +++++++++---------- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 4 ++++ .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++++++------- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 4 ++++ .../CODEGEN_cudacpp_heft_gg_h_log.txt | 8 ++++---- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 4 ++++ 21 files changed, 113 insertions(+), 73 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index a9e90bce22..cf0a7c42ac 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006835222244262695  +DEBUG: model prefixing takes 0.0068280696868896484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -168,7 +168,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_ll_ll -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -206,14 +206,14 @@ ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.267 s +ALOHA: aloha creates 3 routines in 0.239 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.323 s +ALOHA: aloha creates 7 routines in 0.304 s FFV1 FFV1 FFV2 @@ -241,6 +241,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.496s -user 0m2.127s -sys 0m0.289s +real 0m2.483s +user 0m2.116s +sys 0m0.298s diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc index 702a9a5a97..f8e8fd0291 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc @@ -908,7 +908,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 9f9afb2c69..c3256376bf 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006852865219116211  +DEBUG: model prefixing takes 0.006785869598388672  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -194,7 +194,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.323 s +ALOHA: aloha creates 4 routines in 0.319 s FFV1 FFV1 FFV2 @@ -212,6 +212,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.929s -user 0m0.825s -sys 0m0.071s +real 0m0.902s +user 0m0.802s +sys 0m0.075s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 5482b4d1b8..fbc65951b6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -906,7 +906,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a8b49ace05..3249588141 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006829977035522461  +DEBUG: model prefixing takes 0.006859540939331055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -204,12 +204,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s -Wrote files for 10 helas calls in 0.135 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s +Wrote files for 10 helas calls in 0.134 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.185 s +ALOHA: aloha creates 2 routines in 0.177 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -238,6 +238,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.278s -user 0m1.942s -sys 0m0.295s +real 0m2.350s +user 0m1.934s +sys 0m0.293s diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index f92c5bfc3e..6074c6e177 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -916,7 +916,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index b7f1fe3a3e..0ab3db5f16 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006822347640991211  +DEBUG: model prefixing takes 0.006834268569946289  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -197,7 +197,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.174 s +ALOHA: aloha creates 2 routines in 0.172 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.774s -user 0m0.670s -sys 0m0.079s +real 0m0.771s +user 0m0.663s +sys 0m0.083s diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 8dfa6e6ea2..0003fb2396 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00680088996887207  +DEBUG: model prefixing takes 0.006825447082519531  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -214,7 +214,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.446 s +ALOHA: aloha creates 5 routines in 0.391 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -222,7 +222,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.380 s +ALOHA: aloha creates 10 routines in 0.370 s VVV1 VVV1 FFV1 @@ -251,6 +251,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.976s -user 0m2.579s -sys 0m0.313s +real 0m2.872s +user 0m2.566s +sys 0m0.286s diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 201b78594d..53ef30130e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1137,7 +1137,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index a364a7a536..e33ec3d8d2 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00687408447265625  +DEBUG: model prefixing takes 0.006829023361206055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.389 s +ALOHA: aloha creates 5 routines in 0.390 s VVV1 VVV1 FFV1 @@ -221,6 +221,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m1.081s -user 0m0.971s -sys 0m0.086s +real 0m1.079s +user 0m0.979s +sys 0m0.076s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index b5cfebcd63..c46f1cc8e0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1131,7 +1131,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 321d5a9829..6e338ad47e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006788969039916992  +DEBUG: model prefixing takes 0.0068132877349853516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -208,15 +208,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.546 s -Wrote files for 222 helas calls in 0.928 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.542 s +Wrote files for 222 helas calls in 0.926 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.397 s +ALOHA: aloha creates 5 routines in 0.639 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -224,7 +224,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.377 s +ALOHA: aloha creates 10 routines in 0.617 s VVV1 VVV1 FFV1 @@ -256,6 +256,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m4.506s -user 0m4.192s -sys 0m0.292s +real 0m5.099s +user 0m4.185s +sys 0m0.305s diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 24a304017f..b422bcb887 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -3083,7 +3083,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 0b56edfa6f..50b9dc624d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006833553314208984  +DEBUG: model prefixing takes 0.006829977035522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.202 s +1 processes with 123 diagrams generated in 0.201 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -204,7 +204,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.385 s +ALOHA: aloha creates 5 routines in 0.526 s VVV1 VVV1 FFV1 @@ -226,6 +226,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m1.936s -user 0m1.814s -sys 0m0.090s +real 0m2.076s +user 0m1.824s +sys 0m0.083s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 8897055998..b98b95fc0b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -3140,7 +3140,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 5631ab8bd7..510108d820 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068318843841552734  +DEBUG: model prefixing takes 0.006801605224609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.396 s +1 processes with 1240 diagrams generated in 2.385 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -171,7 +171,7 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1592 term in 40s. Introduce 2768 contraction -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -212,15 +212,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.326 s -Wrote files for 2281 helas calls in 53.171 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.383 s +Wrote files for 2281 helas calls in 53.334 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.455 s +ALOHA: aloha creates 5 routines in 0.414 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -228,7 +228,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.379 s +ALOHA: aloha creates 10 routines in 0.818 s VVV1 VVV1 FFV1 @@ -260,6 +260,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 1m9.234s -user 1m8.004s -sys 0m1.096s +real 1m9.884s +user 1m8.146s +sys 0m1.174s diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 8b76866ca9..08d4b11eac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30750,7 +30750,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 59e708bc12..2092433d03 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006831169128417969  +DEBUG: model prefixing takes 0.006810188293457031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.364 s +1 processes with 1240 diagrams generated in 3.188 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -198,7 +198,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.304 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.323 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.417 s +ALOHA: aloha creates 5 routines in 0.415 s VVV1 VVV1 FFV1 @@ -228,6 +228,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m16.307s -user 0m16.137s -sys 0m0.145s +real 0m17.159s +user 0m16.185s +sys 0m0.167s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index 258db7eba4..8fb76d7ed5 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -32640,7 +32640,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index d31fa64b1d..60555bdbe2 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -168,7 +168,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.073 s +ALOHA: aloha creates 1 routines in 0.074 s VVS3 FileWriter for /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src/. @@ -179,6 +179,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.749s -user 0m0.541s -sys 0m0.079s +real 0m0.652s +user 0m0.544s +sys 0m0.080s diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index e009930829..6402043d11 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -867,7 +867,11 @@ namespace mg5amcCpu // - shared: as the name says // - private: give each thread its own copy, without initialising // - firstprivate: give each thread its own copy, and initialise with value from outside +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 ) +#else +#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 ) +#endif #endif // _OPENMP for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) { From 34b3487647e1945f75838e5a70fa83aa3a6b376a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:30:53 +0100 Subject: [PATCH 14/42] [omp] in ggtt.mad try to fix OMP with clang - stil fails, will revert --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 21621aa244..ab22993806 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -599,14 +599,20 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +$(testmain): LIBFLAGS += -lomp +else +$(testmain): LIBFLAGS += -lgomp +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From a8be5198eb4506feff774ca97f4b68e51f82264b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:31:13 +0100 Subject: [PATCH 15/42] Revert "[omp] in ggtt.mad try to fix OMP with clang - stil fails, will revert" This reverts commit 3ec8cf25ecccb9bed2bc450c33f2b4e2c81b7b5b. --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index ab22993806..21621aa244 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -599,20 +599,14 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif -ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) -$(testmain): LIBFLAGS += -lomp -else -$(testmain): LIBFLAGS += -lgomp -endif - ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From e640b1d7f39dd2fd91a2da8110547ab69d29d08c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:37:14 +0100 Subject: [PATCH 16/42] [omp] in ggtt.mad try another fix for OMP in clang, still fails, will revert --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 21621aa244..6e599a8a7e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -171,8 +171,11 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = -fopenmp=libiomp5 # openmp for clang see https://stackoverflow.com/questions/33357029/using-openmp-with-clang +###override OMPFLAGS = # disable OpenMP MT (default before #575) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif From e15602d52ac1f2938e49116f737873cd0e2e6697 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:37:33 +0100 Subject: [PATCH 17/42] Revert "[omp] try another fix for OMP in clang, still fails, will revert" This reverts commit 2b47ccfb99700c0fb4852538fe2e1fb313732362. --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 6e599a8a7e..21621aa244 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -171,11 +171,8 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | grep ^Intel),) override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = -fopenmp=libiomp5 # openmp for clang see https://stackoverflow.com/questions/33357029/using-openmp-with-clang -###override OMPFLAGS = # disable OpenMP MT (default before #575) else -override OMPFLAGS = -fopenmp +override OMPFLAGS = -fopenmp # enable OpenMP MT #575 ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif From 4a4d9ec963434df708741dd08dee9966660e800f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:55:22 +0100 Subject: [PATCH 18/42] [omp] in ggtt.sa disable OMP in clang Tested ok on gcc112/clang13/icpx2022, all with/without nvcc --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 21621aa244..b90a3ecbe7 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +override OMPFLAGS = # disable OpenMP MT on clang and icpx else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +599,20 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From 296b7c1ccfd6f77b0abe2146f70dfe4de32424b9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 12:57:22 +0100 Subject: [PATCH 19/42] [omp] in ggtt.sa timermap.h fix an icpx build warning --- epochX/cudacpp/gg_tt.sa/SubProcesses/timermap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/timermap.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) From 20570fd0f1f7c6094c3806a4e61491c5d65348ec Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:25:54 +0100 Subject: [PATCH 20/42] [omp] in ggtt.mad port the timermap build warning fix from ggtt.sa --- epochX/cudacpp/gg_tt.mad/SubProcesses/timermap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/timermap.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) From 9423f4e1b46b1600d1981bea72eb0a0575e524a8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:26:33 +0100 Subject: [PATCH 21/42] [omp] in ggtt.mad port the omp build fixes in cudacpp.mk from ggtt.sa --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 21621aa244..b90a3ecbe7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -169,10 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +override OMPFLAGS = # disable OpenMP MT on clang and icpx else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +599,20 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From ea17b20c497045eeaccd5e913955aac71b5d1d7b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:34:27 +0100 Subject: [PATCH 22/42] [omp] in ggtt.mad ompnumthreads.cc disable the build if _OPENMP is not defined --- .../cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif From a7301b62ad49914227b19c85c57c5707532ee7ec Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:36:18 +0100 Subject: [PATCH 23/42] [omp] in ggtt.mad driver.f disable OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD if _OPENMP is not defined --- epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index bad74950a1..ab38b2202e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -88,7 +88,9 @@ Program DRIVER call cpu_time(t_before) CUMULATED_TIMING = t_before +#ifdef _OPENMP CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif CALL COUNTERS_INITIALISE() c#ifdef MG5AMC_MEEXPORTER_CUDACPP From a8301bf32d4d658c8dbeeadbf71c255297ebe527 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:39:12 +0100 Subject: [PATCH 24/42] [omp] in ggttmad makefile disable OMP for icpx in the build of Fortran/madevent software Tested on icpx with/without nvcc --- .../cudacpp/gg_tt.mad/SubProcesses/makefile | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile index b7e084145e..1465fdbac4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -94,8 +94,14 @@ else all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) endif +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on icpx +else +override OMPFLAGS = -fopenmp +endif + $(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) + $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) $(LIBS): .libs @@ -122,17 +128,17 @@ endif # Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi counters.o: counters.cc timer.h $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ + $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) @@ -150,11 +156,11 @@ endif # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ %_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies From 8ce243e92581e0b1b1ac25cf9905eb3a2126903c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:50:28 +0100 Subject: [PATCH 25/42] [omp] in ggttmad makefile, reenable OMP for icpx in Fortran/madevent, fix build error about missing intel_fast_copy This and previous timermap/driver/ompnumthreads fixes for openmp are part of the #561 patch --- epochX/cudacpp/gg_tt.mad/SubProcesses/makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile index 1465fdbac4..365e9d0ed2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -95,7 +95,8 @@ all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp endif ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = # disable OpenMP MT on icpx +override OMPFLAGS = -fopenmp +LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' else override OMPFLAGS = -fopenmp endif From 6788accf611f788df89438c16ca7562074eff085 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:56:19 +0100 Subject: [PATCH 26/42] [omp] in CODEGEN port the omp build fixes in cudacpp.mk from ggtt.sa/mad --- .../iolibs/template_files/gpu/cudacpp.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 3086ecb86d..608655dd27 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -169,10 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +override OMPFLAGS = # disable OpenMP MT on clang and icpx else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +599,20 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %%bin/clang++,%%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 From 42f5609b7c8a8fe00149757e717869e427fd18ea Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:57:52 +0100 Subject: [PATCH 27/42] [omp] in CODEGEN ompnumthreads.cc disable the build if _OPENMP is not defined, port from ggtt.mad --- epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc +++ b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif From e7a78c19742167bcd82745eb37f5d5956ab469a5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 13:59:50 +0100 Subject: [PATCH 28/42] [omp] in CODEGEN backport to patch.P1 and patch.common ./CODEGEN/generateAndCompare.sh gg_tt --mad --nopatch git diff --no-ext-diff -R gg_tt.mad/Source/dsample.f gg_tt.mad/Source/genps.inc gg_tt.mad/Source/vector.inc gg_tt.mad/SubProcesses/makefile > CODEGEN/MG5aMC_patches/PROD/patch.common git diff --no-ext-diff -R gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f > CODEGEN/MG5aMC_patches/PROD/patch.P1 git checkout gg_tt.mad --- .../CODEGEN/MG5aMC_patches/PROD/patch.P1 | 16 +++++---- .../CODEGEN/MG5aMC_patches/PROD/patch.common | 33 ++++++++++++------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 index 8659e871be..2e113d777a 100644 --- a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 +++ b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1 @@ -1,5 +1,5 @@ diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f -index 62b656862..0ae2524b4 100644 +index aa01cb976..50d82f805 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -463,23 +463,140 @@ C @@ -157,11 +157,11 @@ index 62b656862..0ae2524b4 100644 END diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f -index 295e7de8d..19aa50965 100644 +index a76de8ec5..ab38b2202 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f -@@ -74,13 +74,52 @@ c common/to_colstats/ncols,ncolflow,ncolalt,ic - include 'vector.inc' +@@ -74,13 +74,54 @@ c common/to_colstats/ncols,ncolflow,ncolalt,ic + include 'vector.inc' ! needed by coupl.inc (defines VECSIZE_MEMMAX) include 'coupl.inc' INTEGER VECSIZE_USED - DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime @@ -179,7 +179,9 @@ index 295e7de8d..19aa50965 100644 call cpu_time(t_before) CUMULATED_TIMING = t_before + ++#ifdef _OPENMP + CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() ++#endif + CALL COUNTERS_INITIALISE() + +c#ifdef MG5AMC_MEEXPORTER_CUDACPP @@ -214,7 +216,7 @@ index 295e7de8d..19aa50965 100644 c c Read process number c -@@ -135,7 +174,8 @@ c If CKKW-type matching, read IS Sudakov grid +@@ -135,7 +176,8 @@ c If CKKW-type matching, read IS Sudakov grid exit 30 issgridfile='../'//issgridfile if(i.eq.5)then @@ -224,7 +226,7 @@ index 295e7de8d..19aa50965 100644 stop endif enddo -@@ -202,8 +242,33 @@ c call sample_result(xsec,xerr) +@@ -202,8 +244,33 @@ c call sample_result(xsec,xerr) c write(*,*) 'Final xsec: ',xsec rewind(lun) @@ -259,7 +261,7 @@ index 295e7de8d..19aa50965 100644 end c $B$ get_user_params $B$ ! tag for MadWeight -@@ -381,7 +446,7 @@ c +@@ -381,7 +448,7 @@ c fopened=.false. tempname=filename fine=index(tempname,' ') diff --git a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common index fdc5af9289..82f69ac450 100644 --- a/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common +++ b/epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common @@ -24,7 +24,7 @@ index a6907622e..3c1e4fdf8 100644 + PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU +c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498) diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile -index dd709f52c..b7e084145 100644 +index dd709f52c..365e9d0ed 100644 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile +++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile @@ -1,6 +1,19 @@ @@ -75,7 +75,7 @@ index dd709f52c..b7e084145 100644 LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS) -@@ -43,24 +75,69 @@ ifeq ($(strip $(MATRIX_HEL)),) +@@ -43,24 +75,76 @@ ifeq ($(strip $(MATRIX_HEL)),) endif @@ -103,8 +103,15 @@ index dd709f52c..b7e084145 100644 +all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) +endif + ++ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ++override OMPFLAGS = -fopenmp ++LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' ++else ++override OMPFLAGS = -fopenmp ++endif ++ +$(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o -+ $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) ++ $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) + +$(LIBS): .libs + @@ -131,17 +138,18 @@ index dd709f52c..b7e084145 100644 + +# Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) +$(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs -+ $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) -+ if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi ++ $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) ++ if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + +counters.o: counters.cc timer.h + $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ + +ompnumthreads.o: ompnumthreads.cc ompnumthreads.h -+ $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ ++ $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp +- $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp ++ $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) - $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS) @@ -151,7 +159,7 @@ index dd709f52c..b7e084145 100644 $(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat cd ../../Source/MODEL; make -@@ -69,12 +146,15 @@ $(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat +@@ -69,12 +153,15 @@ $(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat $(LIBDIR)libpdf.$(libext): cd ../../Source/PDF; make @@ -159,16 +167,17 @@ index dd709f52c..b7e084145 100644 # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp +- $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp ++ $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -+ $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ ++ $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ +%_cudacpp.o: %.f -+ $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ ++ $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies -@@ -94,5 +174,71 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ +@@ -94,5 +181,71 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \ run_config.inc initcluster.o: message.inc From ccdda176ce124e4dcc760c4659b8feec4485bff7 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:01:19 +0100 Subject: [PATCH 29/42] [omp] in CODEGEN backport timermap.h with icpx fixes --- .../madgraph/iolibs/template_files/gpu/timermap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) From 85d9b333489bdaac2531527008970dbfa6b80347 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:02:00 +0100 Subject: [PATCH 30/42] [omp] regenerate ggtt mad and sa, both stable --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 20 +++++++++---------- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 3249588141..af45e39419 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006859540939331055  +DEBUG: model prefixing takes 0.005887031555175781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -204,17 +204,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s -Wrote files for 10 helas calls in 0.134 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Wrote files for 10 helas calls in 0.112 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.177 s +ALOHA: aloha creates 2 routines in 0.140 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.160 s +ALOHA: aloha creates 4 routines in 0.126 s VVV1 FFV1 FFV1 @@ -238,6 +238,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.350s -user 0m1.934s -sys 0m0.293s +real 0m2.163s +user 0m1.749s +sys 0m0.355s diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 0ab3db5f16..395dcb5536 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006834268569946289  +DEBUG: model prefixing takes 0.005887031555175781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.010 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -192,12 +192,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.172 s +ALOHA: aloha creates 2 routines in 0.141 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.771s -user 0m0.663s +real 0m0.709s +user 0m0.581s sys 0m0.083s From 52dc5b57f618376365b2e0addef8abd1e17fe519 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:03:48 +0100 Subject: [PATCH 31/42] [omp] in ggttsa separate clang and Intel for openmp in cudacpp.mk --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index b90a3ecbe7..1ca507333b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -169,8 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) -override OMPFLAGS = # disable OpenMP MT on clang and icpx +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on icpx +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang else override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) From b8af3a8790aa6f93d912ea43ebdcbde2fe5bdfd9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:16:29 +0100 Subject: [PATCH 32/42] [omp] in ggtt.sa try again to add omp for icpx, dfails and will revert --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 1ca507333b..efe69d944c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -170,7 +170,8 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = # disable OpenMP MT on icpx +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT on icpx else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = # disable OpenMP MT on clang else @@ -607,10 +608,17 @@ $(testmain): LIBFLAGS += -lgomp endif endif +$(testmain): LIBFLAGS += -lstdc++ # link with FC +$(testmain): LIBFLAGS += -openmp +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(testmain): LIBFLAGS += -lintlc -lsvml +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) +#$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) From e7630773f7a7ac3c1ae073ad2a0a4b6f1deab670 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:16:53 +0100 Subject: [PATCH 33/42] Revert "[omp] in ggtt.sa try again to add omp for icpx, dfails and will revert" This reverts commit 25a9d296523717450ed34e68e861327059f4ffdf. --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index efe69d944c..1ca507333b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -170,8 +170,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on icpx +override OMPFLAGS = # disable OpenMP MT on icpx else ifneq ($(shell $(CXX) --version | egrep '^clang'),) override OMPFLAGS = # disable OpenMP MT on clang else @@ -608,17 +607,10 @@ $(testmain): LIBFLAGS += -lgomp endif endif -$(testmain): LIBFLAGS += -lstdc++ # link with FC -$(testmain): LIBFLAGS += -openmp -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc -lsvml -endif - ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) -#$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) From aad2eb3da8d18aaeb5f5a821e2557595bee822a6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:25:56 +0100 Subject: [PATCH 34/42] [omp] in ggtt.sa partial fix for OMP on icpx in cudacpp.mk: ok without nvcc, not ok with nvcc --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 1ca507333b..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -170,9 +170,9 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = # disable OpenMP MT on icpx +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override OMPFLAGS = # disable OpenMP MT on clang +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) @@ -602,7 +602,11 @@ $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstwor endif ifneq ($(OMPFLAGS),) -ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else $(testmain): LIBFLAGS += -lgomp endif endif From ea85f24ad4b471bd2d0e3b90ef326b08bf9955b5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:37:51 +0100 Subject: [PATCH 35/42] [omp] in CODEGEN backport the last attempts from ggtt.sa to fix omp on icpx (only partly done, without nvcc) --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 608655dd27..faaf1033ba 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -169,8 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) -override OMPFLAGS = # disable OpenMP MT on clang and icpx +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) @@ -600,7 +602,11 @@ $(testmain): LIBFLAGS += -L$(patsubst %%bin/clang++,%%lib,$(shell which $(firstw endif ifneq ($(OMPFLAGS),) -ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else $(testmain): LIBFLAGS += -lgomp endif endif From ef2ab6e08e9896527db797a4ad884c104530dc0f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:38:42 +0100 Subject: [PATCH 36/42] [omp] regenerate ggttsa, all stable --- .../cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 395dcb5536..a67464299d 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005887031555175781  +DEBUG: model prefixing takes 0.006864786148071289  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.010 s Total: 1 processes with 3 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_SA_OUTPUT @@ -192,12 +192,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.141 s +ALOHA: aloha creates 2 routines in 0.173 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.709s -user 0m0.581s -sys 0m0.083s +real 0m0.900s +user 0m0.685s +sys 0m0.078s From 460a598e112b4470ca35ef8abd4e82469e82bbd4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:39:11 +0100 Subject: [PATCH 37/42] [omp] regenerate ggtt mad --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 20 +++++++++---------- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 12 ++++++++--- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index af45e39419..d32971ec2f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005887031555175781  +DEBUG: model prefixing takes 0.0068433284759521484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.010 s Total: 1 processes with 3 diagrams output madevent CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -204,17 +204,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.112 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Wrote files for 10 helas calls in 0.134 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.140 s +ALOHA: aloha creates 2 routines in 0.174 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.126 s +ALOHA: aloha creates 4 routines in 0.158 s VVV1 FFV1 FFV1 @@ -238,6 +238,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.163s -user 0m1.749s -sys 0m0.355s +real 0m2.258s +user 0m1.927s +sys 0m0.301s diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index b90a3ecbe7..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -169,8 +169,10 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) -override OMPFLAGS = # disable OpenMP MT on clang and icpx +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) @@ -600,7 +602,11 @@ $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstwor endif ifneq ($(OMPFLAGS),) -ifeq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else $(testmain): LIBFLAGS += -lgomp endif endif From e7a3a0e60d590c2e438178fa32bdf4173e06d8d9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 14:51:07 +0100 Subject: [PATCH 38/42] [omp] regenerate 5 mad and 6 sa - completed OMP reenabling on gcc #575 (not yet on icpx clang #578) --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 16 ++++++------- .../SubProcesses/P1_ll_ll/driver.f | 2 ++ .../SubProcesses/P1_ll_ll/ompnumthreads.cc | 2 ++ .../ee_mumu.mad/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../cudacpp/ee_mumu.mad/SubProcesses/makefile | 23 ++++++++++++------- .../ee_mumu.mad/SubProcesses/timermap.h | 4 ++-- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++++---- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../ee_mumu.sa/SubProcesses/timermap.h | 4 ++-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++++++------- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 ++++---- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 16 ++++++------- .../SubProcesses/P1_gg_ttxg/driver.f | 2 ++ .../SubProcesses/P1_gg_ttxg/ompnumthreads.cc | 2 ++ .../gg_ttg.mad/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../cudacpp/gg_ttg.mad/SubProcesses/makefile | 23 ++++++++++++------- .../gg_ttg.mad/SubProcesses/timermap.h | 4 ++-- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 10 ++++---- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../cudacpp/gg_ttg.sa/SubProcesses/timermap.h | 4 ++-- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 18 +++++++-------- .../SubProcesses/P1_gg_ttxgg/driver.f | 2 ++ .../SubProcesses/P1_gg_ttxgg/ompnumthreads.cc | 2 ++ .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../cudacpp/gg_ttgg.mad/SubProcesses/makefile | 23 ++++++++++++------- .../gg_ttgg.mad/SubProcesses/timermap.h | 4 ++-- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 +++++------ .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../gg_ttgg.sa/SubProcesses/timermap.h | 4 ++-- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 ++++++++-------- .../SubProcesses/P1_gg_ttxggg/driver.f | 2 ++ .../P1_gg_ttxggg/ompnumthreads.cc | 2 ++ .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../gg_ttggg.mad/SubProcesses/makefile | 23 ++++++++++++------- .../gg_ttggg.mad/SubProcesses/timermap.h | 4 ++-- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 +++++------ .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../gg_ttggg.sa/SubProcesses/timermap.h | 4 ++-- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 6 ++--- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 22 ++++++++++++++---- .../heft_gg_h.sa/SubProcesses/timermap.h | 4 ++-- 41 files changed, 322 insertions(+), 170 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index cf0a7c42ac..15bd90b9c8 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068280696868896484  +DEBUG: model prefixing takes 0.006861448287963867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -168,7 +168,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_ll_ll -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -201,19 +201,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group ll_ll Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s -Wrote files for 8 helas calls in 0.118 s +Wrote files for 8 helas calls in 0.117 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.239 s +ALOHA: aloha creates 3 routines in 0.241 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.304 s +ALOHA: aloha creates 7 routines in 0.307 s FFV1 FFV1 FFV2 @@ -241,6 +241,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.483s -user 0m2.116s -sys 0m0.298s +real 0m2.451s +user 0m2.118s +sys 0m0.303s diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/driver.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/driver.f index bad74950a1..ab38b2202e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/driver.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/driver.f @@ -88,7 +88,9 @@ Program DRIVER call cpu_time(t_before) CUMULATED_TIMING = t_before +#ifdef _OPENMP CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif CALL COUNTERS_INITIALISE() c#ifdef MG5AMC_MEEXPORTER_CUDACPP diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/ompnumthreads.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/ompnumthreads.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile index b7e084145e..365e9d0ed2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/makefile @@ -94,8 +94,15 @@ else all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) endif +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' +else +override OMPFLAGS = -fopenmp +endif + $(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) + $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) $(LIBS): .libs @@ -122,17 +129,17 @@ endif # Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi counters.o: counters.cc timer.h $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ + $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) @@ -150,11 +157,11 @@ endif # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ %_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/timermap.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/timermap.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index c3256376bf..c447b2a7b0 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -57,7 +57,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006785869598388672  +DEBUG: model prefixing takes 0.006835460662841797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -194,7 +194,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.319 s +ALOHA: aloha creates 4 routines in 0.321 s FFV1 FFV1 FFV2 @@ -212,6 +212,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.902s -user 0m0.802s -sys 0m0.075s +real 0m0.900s +user 0m0.801s +sys 0m0.073s diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/timermap.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index d32971ec2f..ecb9d916e4 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068433284759521484  +DEBUG: model prefixing takes 0.006825685501098633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -204,17 +204,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s +Generated helas calls for 1 subprocesses (3 diagrams) in 0.008 s Wrote files for 10 helas calls in 0.134 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.174 s +ALOHA: aloha creates 2 routines in 0.175 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.158 s +ALOHA: aloha creates 4 routines in 0.159 s VVV1 FFV1 FFV1 @@ -238,6 +238,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.258s -user 0m1.927s -sys 0m0.301s +real 0m2.239s +user 0m1.944s +sys 0m0.272s diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index a67464299d..3aab08d216 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006864786148071289  +DEBUG: model prefixing takes 0.006810903549194336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -197,7 +197,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.173 s +ALOHA: aloha creates 2 routines in 0.172 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.900s -user 0m0.685s -sys 0m0.078s +real 0m0.773s +user 0m0.683s +sys 0m0.065s diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 0003fb2396..dc4d1f0187 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006825447082519531  +DEBUG: model prefixing takes 0.006867170333862305  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -207,14 +207,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.048 s -Wrote files for 36 helas calls in 0.203 s +Wrote files for 36 helas calls in 0.201 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.391 s +ALOHA: aloha creates 5 routines in 0.467 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -222,7 +222,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.370 s +ALOHA: aloha creates 10 routines in 0.376 s VVV1 VVV1 FFV1 @@ -251,6 +251,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.872s -user 0m2.566s -sys 0m0.286s +real 0m2.963s +user 0m2.548s +sys 0m0.312s diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f index 3da7398e4c..22d3aaea6d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -88,7 +88,9 @@ Program DRIVER call cpu_time(t_before) CUMULATED_TIMING = t_before +#ifdef _OPENMP CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif CALL COUNTERS_INITIALISE() c#ifdef MG5AMC_MEEXPORTER_CUDACPP diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile index b7e084145e..365e9d0ed2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/makefile @@ -94,8 +94,15 @@ else all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) endif +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' +else +override OMPFLAGS = -fopenmp +endif + $(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) + $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) $(LIBS): .libs @@ -122,17 +129,17 @@ endif # Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi counters.o: counters.cc timer.h $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ + $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) @@ -150,11 +157,11 @@ endif # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ %_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index e33ec3d8d2..c8b72d4c03 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006829023361206055  +DEBUG: model prefixing takes 0.006863594055175781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -202,7 +202,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.390 s +ALOHA: aloha creates 5 routines in 0.389 s VVV1 VVV1 FFV1 @@ -221,6 +221,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m1.079s -user 0m0.979s -sys 0m0.076s +real 0m1.078s +user 0m0.986s +sys 0m0.067s diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 6e338ad47e..c59259a150 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0068132877349853516  +DEBUG: model prefixing takes 0.006790876388549805  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -169,7 +169,7 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -208,15 +208,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.542 s -Wrote files for 222 helas calls in 0.926 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.539 s +Wrote files for 222 helas calls in 0.923 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.639 s +ALOHA: aloha creates 5 routines in 0.419 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -224,7 +224,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.617 s +ALOHA: aloha creates 10 routines in 0.377 s VVV1 VVV1 FFV1 @@ -256,6 +256,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m5.099s -user 0m4.185s -sys 0m0.305s +real 0m4.558s +user 0m4.153s +sys 0m0.309s diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f index 3c9005a4c1..84e692eb38 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f @@ -88,7 +88,9 @@ Program DRIVER call cpu_time(t_before) CUMULATED_TIMING = t_before +#ifdef _OPENMP CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif CALL COUNTERS_INITIALISE() c#ifdef MG5AMC_MEEXPORTER_CUDACPP diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile index b7e084145e..365e9d0ed2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/makefile @@ -94,8 +94,15 @@ else all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) endif +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' +else +override OMPFLAGS = -fopenmp +endif + $(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) + $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) $(LIBS): .libs @@ -122,17 +129,17 @@ endif # Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi counters.o: counters.cc timer.h $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ + $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) @@ -150,11 +157,11 @@ endif # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ %_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 50b9dc624d..c553e3ec15 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006829977035522461  +DEBUG: model prefixing takes 0.006834983825683594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.201 s +1 processes with 123 diagrams generated in 0.205 s Total: 1 processes with 123 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -196,7 +196,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.540 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.543 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -204,7 +204,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.526 s +ALOHA: aloha creates 5 routines in 0.383 s VVV1 VVV1 FFV1 @@ -226,6 +226,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttgg/src/. DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m2.076s -user 0m1.824s -sys 0m0.083s +real 0m1.937s +user 0m1.826s +sys 0m0.087s diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 510108d820..63058c2852 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006801605224609375  +DEBUG: model prefixing takes 0.0068781375885009766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.385 s +1 processes with 1240 diagrams generated in 2.374 s Total: 1 processes with 1240 diagrams output madevent CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_SA_OUTPUT @@ -171,7 +171,7 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1592 term in 40s. Introduce 2768 contraction -DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6126]  INFO: Creating files in directory . DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199]  DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201]  @@ -212,15 +212,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: Done [export_cpp.py at line 713]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.383 s -Wrote files for 2281 helas calls in 53.334 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.397 s +Wrote files for 2281 helas calls in 53.001 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.414 s +ALOHA: aloha creates 5 routines in 0.381 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -228,7 +228,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.818 s +ALOHA: aloha creates 10 routines in 0.373 s VVV1 VVV1 FFV1 @@ -260,6 +260,6 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 1m9.884s -user 1m8.146s -sys 0m1.174s +real 1m8.971s +user 1m7.945s +sys 0m0.993s diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f index 374d8fcf08..27dcac90bb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f @@ -88,7 +88,9 @@ Program DRIVER call cpu_time(t_before) CUMULATED_TIMING = t_before +#ifdef _OPENMP CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD() +#endif CALL COUNTERS_INITIALISE() c#ifdef MG5AMC_MEEXPORTER_CUDACPP diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc index e319a0a926..ee05a1162c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc @@ -8,6 +8,7 @@ // Hence use 'extern "C"' to avoid name mangling by the C++ compiler // See https://www.geeksforgeeks.org/extern-c-in-c +#ifdef _OPENMP extern "C" { void ompnumthreads_not_set_means_one_thread_() @@ -16,3 +17,4 @@ extern "C" ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file } } +#endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile index b7e084145e..365e9d0ed2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/makefile @@ -94,8 +94,15 @@ else all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503) endif +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy' +else +override OMPFLAGS = -fopenmp +endif + $(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o - $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS) + $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS) $(LIBS): .libs @@ -122,17 +129,17 @@ endif # Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs - $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) - if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi + $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS) + if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi counters.o: counters.cc timer.h $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@ ompnumthreads.o: ompnumthreads.cc ompnumthreads.h - $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@ + $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@ $(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL) - $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp + $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS) gensym: $(SYMMETRY) configs.inc $(LIBS) $(FC) -o gensym $(SYMMETRY) -L$(LIBDIR) $(LINKLIBS) $(LDFLAGS) @@ -150,11 +157,11 @@ endif # Add source so that the compiler finds the DiscreteSampler module. $(MATRIX): %.o: %.f - $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp + $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS) %.o: %.f - $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@ %_cudacpp.o: %.f - $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@ + $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@ # Dependencies diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 2092433d03..1061975627 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006810188293457031  +DEBUG: model prefixing takes 0.006824493408203125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -150,7 +150,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 3.188 s +1 processes with 1240 diagrams generated in 2.386 s Total: 1 processes with 1240 diagrams output standalone_cudacpp CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_SA_OUTPUT @@ -198,7 +198,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/G DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1308]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1319]  DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1330]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 8.323 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 8.367 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.415 s +ALOHA: aloha creates 5 routines in 0.418 s VVV1 VVV1 FFV1 @@ -228,6 +228,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_ttggg/src/ DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m17.159s -user 0m16.185s -sys 0m0.167s +real 0m16.586s +user 0m16.415s +sys 0m0.145s diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/timermap.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 60555bdbe2..ca934243a6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -179,6 +179,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_heft_gg_h/src DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.652s -user 0m0.544s -sys 0m0.080s +real 0m0.834s +user 0m0.528s +sys 0m0.098s diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 21621aa244..71bdcfd5e6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -169,10 +169,12 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | grep ^Intel),) -override OMPFLAGS = # disable OpenMP MT on the Intel compiler (on gcc this requires gcc>=9.3, issue #269) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else -override OMPFLAGS = -fopenmp # enable OpenMP MT #575 +override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT (default before #575) endif @@ -599,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),) $(testmain): LIBFLAGS += -L$(patsubst %bin/clang++,%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1)) endif +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + ifeq ($(NVCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -lgomp $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/timermap.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/timermap.h index 473d387dfe..60d8c51021 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/timermap.h @@ -81,7 +81,7 @@ namespace mgOnGpu maxsize = std::max( maxsize, ip.first.size() ); maxsize = std::max( maxsize, totalKey.size() ); // Compute the overall total - size_t ipart = 0; + //size_t ipart = 0; float total = 0; //float totalBut2 = 0; float total123 = 0; @@ -100,7 +100,7 @@ namespace mgOnGpu if( ip.first[0] == '2' ) total2 += ip.second; if( ip.first[0] == '3' ) total3 += ip.second; if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; - ipart++; + //ipart++; } // Dump individual partition timers and the overall total if( json ) From 5bf24cb350529eda4da416df95ea555a4c4c4ebd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 15:12:38 +0100 Subject: [PATCH 39/42] [omp] in ggttsa disable openmp also in Apple clangy --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp From e58d0f6cb5b8d6aa62613cfaeed665814615c846 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 15:14:59 +0100 Subject: [PATCH 40/42] [omp] in CODEGEN backport Apple clang fixes for omp --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index faaf1033ba..16f5ff6557 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp From 61f9d01d588ccac742f9d7144a9c561ad57ae5cc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 15:15:59 +0100 Subject: [PATCH 41/42] [omp] regenerate ggtt sa, all ok stable --- epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 3aab08d216..5a8f6c4e04 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -57,7 +57,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006810903549194336  +DEBUG: model prefixing takes 0.006845235824584961  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -197,7 +197,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.172 s +ALOHA: aloha creates 2 routines in 0.175 s VVV1 FFV1 FFV1 @@ -211,6 +211,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_gg_tt/src/. a DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  quit -real 0m0.773s -user 0m0.683s -sys 0m0.065s +real 0m0.785s +user 0m0.693s +sys 0m0.064s From 23be18ec2f24b6290ea3095a70fc9e36cffd2cbf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 19 Dec 2022 15:17:38 +0100 Subject: [PATCH 42/42] [omp] ** COMPLETE OMP ** copy cudacpp.mk from ggtt.sa to the other 5 sa and 5 mad --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk | 6 +++--- 10 files changed, 30 insertions(+), 30 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 71bdcfd5e6..5d8e0f1b89 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -171,7 +171,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc) else override OMPFLAGS = -fopenmp @@ -436,7 +436,7 @@ endif # Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) # This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),) +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option ###ifneq ($(NVCC),) ###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option @@ -604,7 +604,7 @@ endif ifneq ($(OMPFLAGS),) ifneq ($(shell $(CXX) --version | egrep '^Intel'),) ###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),) ###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp... else $(testmain): LIBFLAGS += -lgomp