Reenable OPENMP multithreading in cudacpp #577

Merged (42 commits) on Dec 19, 2022

Commits
cacd48d
[omp] in gg_tt.mad reenable OpenMP MT in cudacpp #575
valassi Dec 19, 2022
9172091
[omp] in CODEGEN backport reenabling of OMP MT #575
valassi Dec 19, 2022
912020d
[omp] regenerate ggtt mad - all stable
valassi Dec 19, 2022
8055444
[omp] regenerate the other 4 processes mad
valassi Dec 19, 2022
3348c61
[omp] regenerate all 6 processes sa
valassi Dec 19, 2022
60fb04f
[omp] in gg_tt.mad fix OMP build for CUDA_HOME=none (add -lgomp to ru…
valassi Dec 19, 2022
6ee189d
[omp] in CODEGEN backport -lgomp fix
valassi Dec 19, 2022
716f085
[omp] regenerate gg_tt.mad, check all ok
valassi Dec 19, 2022
bf2a2a5
[omp] manually copy ggt_mad cudacpp.mk to the other 4 mad and to all …
valassi Dec 19, 2022
3b886c3
[omp] in ggtt.sa fix OMP #575 when MULTICHANNEL is disabled #568
valassi Dec 19, 2022
226c482
[omp] in CODEGEN backport OMP fix with MULTICHANNEL disabled
valassi Dec 19, 2022
1520997
[omp] regenerate ggtt.sa - all is stable
valassi Dec 19, 2022
80b53a1
[omp] regenerate all 6 sa and 5 mad - complete reenabling of OMP MT #…
valassi Dec 19, 2022
34b3487
[omp] in ggtt.mad try to fix OMP with clang - stil fails, will revert
valassi Dec 19, 2022
a8be519
Revert "[omp] in ggtt.mad try to fix OMP with clang - stil fails, wil…
valassi Dec 19, 2022
e640b1d
[omp] in ggtt.mad try another fix for OMP in clang, still fails, will…
valassi Dec 19, 2022
e15602d
Revert "[omp] try another fix for OMP in clang, still fails, will rev…
valassi Dec 19, 2022
4a4d9ec
[omp] in ggtt.sa disable OMP in clang
valassi Dec 19, 2022
296b7c1
[omp] in ggtt.sa timermap.h fix an icpx build warning
valassi Dec 19, 2022
20570fd
[omp] in ggtt.mad port the timermap build warning fix from ggtt.sa
valassi Dec 19, 2022
9423f4e
[omp] in ggtt.mad port the omp build fixes in cudacpp.mk from ggtt.sa
valassi Dec 19, 2022
ea17b20
[omp] in ggtt.mad ompnumthreads.cc disable the build if _OPENMP is no…
valassi Dec 19, 2022
a7301b6
[omp] in ggtt.mad driver.f disable OMPNUMTHREADS_NOT_SET_MEANS_ONE_TH…
valassi Dec 19, 2022
a8301bf
[omp] in ggttmad makefile disable OMP for icpx in the build of Fortra…
valassi Dec 19, 2022
8ce243e
[omp] in ggttmad makefile, reenable OMP for icpx in Fortran/madevent,…
valassi Dec 19, 2022
6788acc
[omp] in CODEGEN port the omp build fixes in cudacpp.mk from ggtt.sa/mad
valassi Dec 19, 2022
42f5609
[omp] in CODEGEN ompnumthreads.cc disable the build if _OPENMP is not…
valassi Dec 19, 2022
e7a78c1
[omp] in CODEGEN backport to patch.P1 and patch.common
valassi Dec 19, 2022
ccdda17
[omp] in CODEGEN backport timermap.h with icpx fixes
valassi Dec 19, 2022
85d9b33
[omp] regenerate ggtt mad and sa, both stable
valassi Dec 19, 2022
52dc5b5
[omp] in ggttsa separate clang and Intel for openmp in cudacpp.mk
valassi Dec 19, 2022
b8af3a8
[omp] in ggtt.sa try again to add omp for icpx, dfails and will revert
valassi Dec 19, 2022
e763077
Revert "[omp] in ggtt.sa try again to add omp for icpx, dfails and wi…
valassi Dec 19, 2022
aad2eb3
[omp] in ggtt.sa partial fix for OMP on icpx in cudacpp.mk: ok withou…
valassi Dec 19, 2022
ea85f24
[omp] in CODEGEN backport the last attempts from ggtt.sa to fix omp o…
valassi Dec 19, 2022
ef2ab6e
[omp] regenerate ggttsa, all stable
valassi Dec 19, 2022
460a598
[omp] regenerate ggtt mad
valassi Dec 19, 2022
e7a3a0e
[omp] regenerate 5 mad and 6 sa - completed OMP reenabling on gcc #57…
valassi Dec 19, 2022
5bf24cb
[omp] in ggttsa disable openmp also in Apple clangy
valassi Dec 19, 2022
e58d0f6
[omp] in CODEGEN backport Apple clang fixes for omp
valassi Dec 19, 2022
61f9d01
[omp] regenerate ggtt sa, all ok stable
valassi Dec 19, 2022
23be18e
[omp] ** COMPLETE OMP ** copy cudacpp.mk from ggtt.sa to the other 5 …
valassi Dec 19, 2022
2 changes: 2 additions & 0 deletions epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/ompnumthreads.cc
@@ -8,6 +8,7 @@
// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
// See https://www.geeksforgeeks.org/extern-c-in-c

#ifdef _OPENMP
extern "C"
{
void ompnumthreads_not_set_means_one_thread_()
@@ -16,3 +17,4 @@ extern "C"
ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
}
}
#endif
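
Editor's note, not part of the diff: the change above compiles the Fortran-callable wrapper only when OpenMP is active, so builds without -fopenmp neither reference OpenMP symbols nor call the setup routine. A minimal standalone sketch of the same pattern follows; the real logic lives in ompnumthreads.h (ompnumthreadsNotSetMeansOneThread), and the environment check and printout here are illustrative assumptions, not the repository code.

// Illustrative sketch only (not the repository's ompnumthreads.cc/.h):
// a Fortran-callable wrapper, guarded by _OPENMP, that defaults to one
// thread when the user has not set OMP_NUM_THREADS.
#include <cstdio>
#include <cstdlib>
#ifdef _OPENMP
#include <omp.h>
extern "C"
{
  // Trailing underscore matches the usual gfortran name mangling, so Fortran
  // can simply CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD()
  void ompnumthreads_not_set_means_one_thread_()
  {
    // If OMP_NUM_THREADS is not set, use one thread instead of all cores
    if( std::getenv( "OMP_NUM_THREADS" ) == nullptr ) omp_set_num_threads( 1 );
    std::printf( "OpenMP will use %d thread(s)\n", omp_get_max_threads() );
  }
}
#endif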
16 changes: 9 additions & 7 deletions epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.P1
@@ -1,5 +1,5 @@
diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
index 62b656862..0ae2524b4 100644
index aa01cb976..50d82f805 100644
--- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
@@ -463,23 +463,140 @@ C
@@ -157,11 +157,11 @@ index 62b656862..0ae2524b4 100644
END

diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
index 295e7de8d..19aa50965 100644
index a76de8ec5..ab38b2202 100644
--- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
@@ -74,13 +74,52 @@ c common/to_colstats/ncols,ncolflow,ncolalt,ic
include 'vector.inc'
@@ -74,13 +74,54 @@ c common/to_colstats/ncols,ncolflow,ncolalt,ic
include 'vector.inc' ! needed by coupl.inc (defines VECSIZE_MEMMAX)
include 'coupl.inc'
INTEGER VECSIZE_USED
- DATA VECSIZE_USED/VECSIZE_MEMMAX/ ! can be changed at runtime
@@ -179,7 +179,9 @@ index 295e7de8d..19aa50965 100644
call cpu_time(t_before)
CUMULATED_TIMING = t_before
+
+#ifdef _OPENMP
+ CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD()
+#endif
+ CALL COUNTERS_INITIALISE()
+
+c#ifdef MG5AMC_MEEXPORTER_CUDACPP
@@ -214,7 +216,7 @@ index 295e7de8d..19aa50965 100644
c
c Read process number
c
@@ -135,7 +174,8 @@ c If CKKW-type matching, read IS Sudakov grid
@@ -135,7 +176,8 @@ c If CKKW-type matching, read IS Sudakov grid
exit
30 issgridfile='../'//issgridfile
if(i.eq.5)then
@@ -224,7 +226,7 @@ index 295e7de8d..19aa50965 100644
stop
endif
enddo
@@ -202,8 +242,33 @@ c call sample_result(xsec,xerr)
@@ -202,8 +244,33 @@ c call sample_result(xsec,xerr)
c write(*,*) 'Final xsec: ',xsec

rewind(lun)
@@ -259,7 +261,7 @@ index 295e7de8d..19aa50965 100644
end

c $B$ get_user_params $B$ ! tag for MadWeight
@@ -381,7 +446,7 @@ c
@@ -381,7 +448,7 @@ c
fopened=.false.
tempname=filename
fine=index(tempname,' ')
33 changes: 21 additions & 12 deletions epochX/cudacpp/CODEGEN/MG5aMC_patches/PROD/patch.common
@@ -24,7 +24,7 @@ index a6907622e..3c1e4fdf8 100644
+ PARAMETER (VECSIZE_MEMMAX=16384) ! NB: 16k events per GPU grid is the minimum required to fill a V100 GPU
+c PARAMETER (VECSIZE_MEMMAX=32) ! NB: workaround for out-of-memory on Juwels: 32 is enough for no-CUDA builds (issue #498)
diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
index dd709f52c..b7e084145 100644
index dd709f52c..365e9d0ed 100644
--- b/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/makefile
@@ -1,6 +1,19 @@
@@ -75,7 +75,7 @@ index dd709f52c..b7e084145 100644

LIBS = $(LIBDIR)libbias.$(libext) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libdsample.$(libext) $(LIBDIR)libgeneric.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libmodel.$(libext) $(LIBDIR)libcernlib.$(libext) $(MADLOOP_LIB) $(LOOP_LIBS)

@@ -43,24 +75,69 @@ ifeq ($(strip $(MATRIX_HEL)),)
@@ -43,24 +75,76 @@ ifeq ($(strip $(MATRIX_HEL)),)
endif


@@ -103,8 +103,15 @@ index dd709f52c..b7e084145 100644
+all: $(PROG) $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp # also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (#503)
+endif
+
+ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
+override OMPFLAGS = -fopenmp
+LINKLIBS += -lintlc # undefined reference to `_intel_fast_memcpy'
+else
+override OMPFLAGS = -fopenmp
+endif
+
+$(PROG): $(PROCESS) $(DSIG) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o
+ $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o $(LDFLAGS)
+ $(FC) -o $(PROG) $(PROCESS) $(DSIG) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o $(LDFLAGS)
+
+$(LIBS): .libs
+
@@ -131,17 +138,18 @@ index dd709f52c..b7e084145 100644
+
+# Also builds g$(PROG)_cudacpp if $(CUDACPP_CULIB) exists (improved patch for cpp-only builds #503)
+$(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp: $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(LIBS) $(MATRIX) counters.o ompnumthreads.o $(CUDACPP_BUILDDIR)/.cudacpplibs
+ $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
+ if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) -fopenmp counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi
+ $(FC) -o $(CUDACPP_BUILDDIR)/c$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CXXLIB) $(LIBFLAGSRPATH) $(LDFLAGS)
+ if [ -f $(LIBDIR)/$(CUDACPP_BUILDDIR)/lib$(CUDACPP_CULIB).* ]; then $(FC) -o $(CUDACPP_BUILDDIR)/g$(PROG)_cudacpp $(PROCESS) $(DSIG_cudacpp) auto_dsig.o $(MATRIX) $(LINKLIBS) $(BIASDEPENDENCIES) $(OMPFLAGS) counters.o ompnumthreads.o -L$(LIBDIR)/$(CUDACPP_BUILDDIR) -l$(CUDACPP_COMMONLIB) -l$(CUDACPP_CULIB) $(LIBFLAGSRPATH) $(LDFLAGS); fi
+
+counters.o: counters.cc timer.h
+ $(CXX) -std=c++11 -Wall -Wshadow -Wextra -c $< -o $@
+
+ompnumthreads.o: ompnumthreads.cc ompnumthreads.h
+ $(CXX) -std=c++11 -Wall -Wshadow -Wextra -fopenmp -c $< -o $@
+ $(CXX) -std=c++11 -Wall -Wshadow -Wextra $(OMPFLAGS) -c $< -o $@

$(PROG)_forhel: $(PROCESS) auto_dsig.o $(LIBS) $(MATRIX_HEL)
$(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp
- $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) -fopenmp
+ $(FC) -o $(PROG)_forhel $(PROCESS) $(MATRIX_HEL) $(LINKLIBS) $(LDFLAGS) $(BIASDEPENDENCIES) $(OMPFLAGS)

gensym: $(SYMMETRY) configs.inc $(LIBS)
- $(FC) -o gensym $(SYMMETRY) -L../../lib/ $(LINKLIBS) $(LDFLAGS)
@@ -151,24 +159,25 @@ index dd709f52c..b7e084145 100644
$(LIBDIR)libmodel.$(libext): ../../Cards/param_card.dat
cd ../../Source/MODEL; make

@@ -69,12 +146,15 @@ $(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat
@@ -69,12 +153,15 @@ $(LIBDIR)libgeneric.$(libext): ../../Cards/run_card.dat

$(LIBDIR)libpdf.$(libext):
cd ../../Source/PDF; make
+endif

# Add source so that the compiler finds the DiscreteSampler module.
$(MATRIX): %.o: %.f
$(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp
- $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ -fopenmp
+ $(FC) $(FFLAGS) $(MATRIX_FLAG) -c $< -I../../Source/ $(OMPFLAGS)
%.o: %.f
- $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp
+ $(FC) $(FFLAGS) -c $< -I../../Source/ -fopenmp -o $@
+ $(FC) $(FFLAGS) -c $< -I../../Source/ $(OMPFLAGS) -o $@
+%_cudacpp.o: %.f
+ $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ -fopenmp -o $@
+ $(FC) $(FFLAGS) -c -DMG5AMC_MEEXPORTER_CUDACPP $< -I../../Source/ $(OMPFLAGS) -o $@

# Dependencies

@@ -94,5 +174,71 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \
@@ -94,5 +181,71 @@ unwgt.o: genps.inc nexternal.inc symswap.inc cluster.inc run.inc message.inc \
run_config.inc
initcluster.o: message.inc

@@ -12,6 +12,7 @@
#include "RamboSamplingKernels.h"
#include "RandomNumberKernels.h"
#include "epoch_process_id.h"
#include "ompnumthreads.h"
#include "timermap.h"

#include <unistd.h>
@@ -169,11 +169,14 @@ endif
#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN

# Set the default OMPFLAGS choice
ifneq ($(shell $(CXX) --version | grep ^Intel),)
override OMPFLAGS = # disable OpenMP on the Intel compiler (on gcc this requires gcc>=9.3, issue #269)
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
override OMPFLAGS = # disable OpenMP MT on Intel (ok without nvcc, not ok with nvcc)
else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),)
override OMPFLAGS = # disable OpenMP MT on clang (not ok without or with nvcc)
else
override OMPFLAGS = -fopenmp
###override OMPFLAGS = # disable OpenMP MT (default before #575)
endif
###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT)
override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT)

# Set the default AVX (vectorization) choice
ifeq ($(AVX),)
@@ -433,7 +436,7 @@ endif

# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516)
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Intel)'),)
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),)
###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option
###ifneq ($(NVCC),)
###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option
@@ -529,7 +532,7 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg
endif
$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe)
$(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS)
$(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS)

ifneq ($(NVCC),)
ifneq ($(shell $(CXX) --version | grep ^Intel),)
@@ -598,14 +601,24 @@ ifneq ($(shell $(CXX) --version | grep ^clang),)
$(testmain): LIBFLAGS += -L$(patsubst %%bin/clang++,%%lib,$(shell which $(firstword $(subst ccache ,,$(CXX))) | tail -1))
endif

ifneq ($(OMPFLAGS),)
ifneq ($(shell $(CXX) --version | egrep '^Intel'),)
###$(testmain): LIBFLAGS += -qopenmp -static-intel # see https://stackoverflow.com/questions/45909648/explicitly-link-intel-icpc-openmp
else ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang)'),)
###$(testmain): LIBFLAGS += ??? # OpenMP on clang is not yet supported in cudacpp...
else
$(testmain): LIBFLAGS += -lgomp
endif
endif

ifeq ($(NVCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -lgomp $(CULIBFLAGS)
$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS)
endif

# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215
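
Editor's note, not part of the patch: OMPFLAGS is now chosen per compiler (empty for clang/Apple clang and for icpx with nvcc, -fopenmp for gcc), so a quick way to verify what a given toolchain actually enables is a standalone probe like the sketch below. The file name and output format are illustrative assumptions, not part of this repository.

// omp_probe.cpp (hypothetical file name): build with and without -fopenmp,
// e.g. "g++ -fopenmp omp_probe.cpp -o omp_probe", to check whether OpenMP
// multithreading is actually enabled by the chosen flags.
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif
int main()
{
#ifdef _OPENMP
  std::printf( "_OPENMP=%d, omp_get_max_threads()=%d\n", _OPENMP, omp_get_max_threads() );
#else
  std::printf( "OpenMP is disabled in this build\n" );
#endif
  return 0;
}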
@@ -96,16 +96,19 @@
#else
const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
/*
#ifdef _OPENMP
// (NB gcc9 or higher, or clang, is required)
// OMP multithreading #575 (NB: tested only with gcc11 so far)
// See https://www.openmp.org/specifications/
// - default(none): no variables are shared by default
// - shared: as the name says
// - private: give each thread its own copy, without initialising
// - firstprivate: give each thread its own copy, and initialise with value from outside
#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators )
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 )
#else
#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 )
#endif
#endif // _OPENMP
*/
for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 )
{
// Running sum of partial amplitudes squared for event by event color selection (#402)
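
Editor's note, not part of the diff: the clauses documented in the comment above behave as in the toy loop below; with default(none) every variable referenced inside the parallel region must be listed explicitly, e.g. in shared(...). Variable names and sizes here are illustrative, not the CPPProcess.cc kernel.

// Toy illustration of "#pragma omp parallel for default( none ) shared( ... )":
// each iteration writes a distinct element, so sharing the buffer is race-free.
#include <cstdio>
#include <vector>
int main()
{
  int nevt = 1024;                        // illustrative event count
  std::vector<double> allMEs( nevt, 0. ); // stand-in for the per-event output buffer
#ifdef _OPENMP
#pragma omp parallel for default( none ) shared( nevt, allMEs )
#endif
  for( int ievt = 0; ievt < nevt; ++ievt ) // the loop index is private to each thread
    allMEs[ievt] = 2. * ievt;
  std::printf( "allMEs[%d] = %f\n", nevt - 1, allMEs[nevt - 1] );
  return 0;
}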
@@ -81,7 +81,7 @@ namespace mgOnGpu
maxsize = std::max( maxsize, ip.first.size() );
maxsize = std::max( maxsize, totalKey.size() );
// Compute the overall total
size_t ipart = 0;
//size_t ipart = 0;
float total = 0;
//float totalBut2 = 0;
float total123 = 0;
@@ -100,7 +100,7 @@ namespace mgOnGpu
if( ip.first[0] == '2' ) total2 += ip.second;
if( ip.first[0] == '3' ) total3 += ip.second;
if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second;
ipart++;
//ipart++;
}
// Dump individual partition timers and the overall total
if( json )
16 changes: 8 additions & 8 deletions epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -57,7 +57,7 @@ generate e+ e- > mu+ mu-
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
DEBUG: model prefixing takes 0.0068399906158447266 
DEBUG: model prefixing takes 0.006861448287963867 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -168,7 +168,7 @@ INFO: Organizing processes into subprocess groups
INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1
INFO: Processing color information for process: e+ e- > mu+ mu- @1
INFO: Creating files in directory P1_ll_ll
DEBUG: process_exporter_cpp =  <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1a6cfcedf0> [export_v4.py at line 6126] 
DEBUG: process_exporter_cpp =  <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7efceb51bdf0> [export_v4.py at line 6126] 
INFO: Creating files in directory .
DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1199] 
DEBUG: self.include_multi_channel is already defined: this is madevent+second_exporter mode [model_handling.py at line 1201] 
@@ -201,19 +201,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1
INFO: Finding symmetric diagrams for subprocess group ll_ll
Generated helas calls for 1 subprocesses (2 diagrams) in 0.005 s
Wrote files for 8 helas calls in 0.118 s
Wrote files for 8 helas calls in 0.117 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates FFV2 routines
ALOHA: aloha creates FFV4 routines
ALOHA: aloha creates 3 routines in 0.240 s
ALOHA: aloha creates 3 routines in 0.241 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates FFV2 routines
ALOHA: aloha creates FFV4 routines
ALOHA: aloha creates FFV2_4 routines
ALOHA: aloha creates 7 routines in 0.305 s
ALOHA: aloha creates 7 routines in 0.307 s
<class 'aloha.create_aloha.AbstractRoutine'> FFV1
<class 'aloha.create_aloha.AbstractRoutine'> FFV1
<class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -241,6 +241,6 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit

real 0m2.425s
user 0m2.110s
sys 0m0.293s
real 0m2.451s
user 0m2.118s
sys 0m0.303s
11 changes: 7 additions & 4 deletions epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/CPPProcess.cc
@@ -901,16 +901,19 @@ namespace mg5amcCpu
#else
const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
/*
#ifdef _OPENMP
// (NB gcc9 or higher, or clang, is required)
// OMP multithreading #575 (NB: tested only with gcc11 so far)
// See https://www.openmp.org/specifications/
// - default(none): no variables are shared by default
// - shared: as the name says
// - private: give each thread its own copy, without initialising
// - firstprivate: give each thread its own copy, and initialise with value from outside
#pragma omp parallel for default( none ) shared( allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators )
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#pragma omp parallel for default( none ) shared( allcouplings, allDenominators, allMEs, allmomenta, allNumerators, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, channelId, cNGoodHel, mgOnGpu::icolamp, MEs_ighel, npagV2 )
#else
#pragma omp parallel for default( none ) shared( allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, MEs_ighel, npagV2 )
#endif
#endif // _OPENMP
*/
for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 )
{
// Running sum of partial amplitudes squared for event by event color selection (#402)
@@ -12,6 +12,7 @@
#include "RamboSamplingKernels.h"
#include "RandomNumberKernels.h"
#include "epoch_process_id.h"
#include "ompnumthreads.h"
#include "timermap.h"

#include <unistd.h>
2 changes: 2 additions & 0 deletions epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_ll_ll/driver.f
@@ -88,7 +88,9 @@ Program DRIVER
call cpu_time(t_before)
CUMULATED_TIMING = t_before

#ifdef _OPENMP
CALL OMPNUMTHREADS_NOT_SET_MEANS_ONE_THREAD()
#endif
CALL COUNTERS_INITIALISE()

c#ifdef MG5AMC_MEEXPORTER_CUDACPP
@@ -8,6 +8,7 @@
// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
// See https://www.geeksforgeeks.org/extern-c-in-c

#ifdef _OPENMP
extern "C"
{
void ompnumthreads_not_set_means_one_thread_()
@@ -16,3 +17,4 @@ extern "C"
ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
}
}
#endif