madgraph5 · valassi · Aug 27, 2024 · Aug 28, 2024 · Aug 28, 2024 · Aug 28, 2024
diff --git a/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/...acpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h
@@ -10,7 +10,7 @@
 
 #include "MemoryAccessHelpers.h"
 #include "MemoryAccessVectors.h"
-#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned
+#include "MemoryBuffers.h" // for HostBufferGs::isaligned
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL

diff --git a/...CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/...CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h
@@ -8,7 +8,7 @@
 
 #include "mgOnGpuConfig.h"
 
-#include "CPPProcess.h"
+#include "CPPProcess.h" // for CPPProcess::np4 and CPPProcess::npar (NB: npar may differ in different P* subprocess directories!)
 #include "MemoryAccessHelpers.h"
 #include "MemoryAccessVectors.h"
 

diff --git a/...X/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/...X/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
@@ -971,6 +971,8 @@ main( int argc, char** argv )
               << " [" << process.getCompiler() << "]"
 #ifdef MGONGPU_INLINE_HELAMPS
               << " [inlineHel=1]"
+#elif defined MGONGPU_LINKER_HELAMPS
+              << " [inlineHel=L]"
 #else
               << " [inlineHel=0]"
 #endif

diff --git a/.../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc2.inc b/.../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc2.inc
@@ -0,0 +1,65 @@
+// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Aug 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin.
+
+#ifdef MGONGPU_LINKER_HELAMPS
+
+#include "HelAmps_sm.h"
+
+// -----------------------------------------------------------------------------
+// *** NB: this implementation class depends on MemoryAccessMomenta,
+// *** where the AOSOA definition depends on CPPProcess::npar,
+// *** which may be different in different P* subprocess directories:
+// *** therefore this class is presently hosted and compiled in each P*
+// -----------------------------------------------------------------------------
+
+#include "MemoryAccessAmplitudes.h"
+#include "MemoryAccessCouplings.h"
+#include "MemoryAccessCouplingsFixed.h"
+#include "MemoryAccessGs.h"
+#include "MemoryAccessMatrixElements.h"
+#include "MemoryAccessMomenta.h"
+#include "MemoryAccessWavefunctions.h"
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#include "MemoryAccessDenominators.h"
+#include "MemoryAccessNumerators.h"
+#endif
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  using M_ACCESS = DeviceAccessMomenta;         // non-trivial access: buffer includes all events
+  using E_ACCESS = DeviceAccessMatrixElements;  // non-trivial access: buffer includes all events
+  using W_ACCESS = DeviceAccessWavefunctions;   // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
+  using A_ACCESS = DeviceAccessAmplitudes;      // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
+  using CD_ACCESS = DeviceAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = DeviceAccessNumerators;   // non-trivial access: buffer includes all events
+  using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+#endif
+#else
+  using namespace ::mg5amcCpu;
+  using M_ACCESS = HostAccessMomenta;         // non-trivial access: buffer includes all events
+  using E_ACCESS = HostAccessMatrixElements;  // non-trivial access: buffer includes all events
+  using W_ACCESS = HostAccessWavefunctions;   // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
+  using A_ACCESS = HostAccessAmplitudes;      // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
+  using CD_ACCESS = HostAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators;    // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators;  // non-trivial access: buffer includes all events
+#endif
+#endif // MGONGPUCPP_GPUIMPL (Device* aliases) else (Host* aliases)
+
+  //--------------------------------------------------------------------------
+%(function_definitions2)s} // end of namespace mg5amcGpu (GPU build) or mg5amcCpu (C++ build)
+#endif // MGONGPU_LINKER_HELAMPS
diff --git a/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -556,8 +556,11 @@ $(info HELINL='$(HELINL)')
 ifeq ($(HELINL),1)
   CXXFLAGS += -DMGONGPU_INLINE_HELAMPS
   GPUFLAGS += -DMGONGPU_INLINE_HELAMPS
+else ifeq ($(HELINL),L) # HELINL=L: compile and link HelAmps functions as separate object files
+  CXXFLAGS += -DMGONGPU_LINKER_HELAMPS
+  GPUFLAGS += -DMGONGPU_LINKER_HELAMPS
 else ifneq ($(HELINL),0)
-  $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported)
+  $(error Unknown HELINL='$(HELINL)': only 'L', '0' and '1' are supported)
 endif
 
 # Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1")
@@ -660,7 +663,6 @@ override RUNTIME =
 #=== Makefile TARGETS and build rules below
 #===============================================================================
 
-
 ifeq ($(GPUCC),)
   cxx_checkmain=$(BUILDDIR)/check_cpp.exe
   cxx_fcheckmain=$(BUILDDIR)/fcheck_cpp.exe
@@ -789,6 +791,19 @@ gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementK
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
+# Add object files and special build flags only for the HELINL=L mode
+ifeq ($(HELINL),L)
+cxx_objects_lib+=$(BUILDDIR)/HelAmps_cpp.o
+gpu_objects_lib+=$(BUILDDIR)/HelAmps_$(GPUSUFFIX).o
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o: GPUFLAGS += -rdc true # compilation fails if this is not added (ptxas fatal: Unresolved extern function)
+$(BUILDDIR)/HelAmps_$(GPUSUFFIX).o: GPUFLAGS += -rdc true # runtime fails if this is not added ('invalid device symbol' in CPPProcess.cc cHel to tHel copy)
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o: GPUFLAGS += -fgpu-rdc # compilation fails if this is not added (lld: error: undefined hidden symbol: mg5amcGpu::linker_CD_FFV1_0)
+$(gpu_checkmain): LIBFLAGS += -fgpu-rdc --hip-link
+endif
+endif
+
 # Target (and build rules): C++ and CUDA/HIP shared libraries
 $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge_cpp.o
 $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge_cpp.o
@@ -799,12 +814,12 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPUARCHFLAGS) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
-#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
+#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPUARCHFLAGS) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
 #else
-#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+#	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPUARCHFLAGS) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 #endif
 endif
 
@@ -975,6 +990,7 @@ $(cxx_testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY
 $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
+$(gpu_testmain): LIBFLAGS += $(GPUARCHFLAGS) # avoid "nvlink warning: SM Arch not found" when using rdc
 ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
 $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)

diff --git a/...cpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/...cpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
@@ -40,7 +40,7 @@ ifneq ($(words $(filter $(FPTYPE), $(SUPPORTED_FPTYPES))),1)
   $(error Invalid fptype FPTYPE='$(FPTYPE)': supported fptypes are $(foreach fptype,$(SUPPORTED_FPTYPES),'$(fptype)'))
 endif
 
-override SUPPORTED_HELINLS = 0 1
+override SUPPORTED_HELINLS = L 0 1
 ifneq ($(words $(filter $(HELINL), $(SUPPORTED_HELINLS))),1)
   $(error Invalid helinl HELINL='$(HELINL)': supported helinls are $(foreach helinl,$(SUPPORTED_HELINLS),'$(helinl)'))
 endif

diff --git a/...dacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/...dacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
@@ -29,7 +29,7 @@
 
 // Choose if curand is supported for generating random numbers
 // For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead)
-// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
+// For both CUDA and C++, by default, do not skip curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
 // (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
 #if defined __HIPCC__
 #define MGONGPU_HAS_NO_CURAND 1
@@ -45,7 +45,7 @@
 
 // Choose if hiprand is supported for generating random numbers
 // For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead)
-// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND
+// For both HIP and C++, by default, do not skip hiprand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND
 // (there may exist HIP installations which do not include hiprand?)
 #if defined __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #define MGONGPU_HAS_NO_HIPRAND 1
@@ -77,9 +77,16 @@
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
+// (NB: MGONGPU_INLINE_HELAMPS and MGONGPU_LINKER_HELAMPS are mutually exclusive)
 //#undef MGONGPU_INLINE_HELAMPS // default
 ////#define MGONGPU_INLINE_HELAMPS 1
 
+// Choose whether to compile and link all HelAmps functions as separate object files
+// By default, do not link, but allow this macro to be set from outside with e.g. -DMGONGPU_LINKER_HELAMPS
+// (NB: MGONGPU_INLINE_HELAMPS and MGONGPU_LINKER_HELAMPS are mutually exclusive)
+//#undef MGONGPU_LINKER_HELAMPS // default
+////#define MGONGPU_LINKER_HELAMPS 1
+
 // Choose whether to hardcode the cIPD physics parameters rather than reading them from user cards
 // This optimization can gain 20%% in CUDA in eemumu (issue #39)
 // By default, do not hardcode, but allow this macro to be set from outside with e.g. -DMGONGPU_HARDCODE_PARAM
@@ -156,6 +163,11 @@
 #endif
 #endif
 
+// SANITY CHECKS (HelAmps): inlining (MGONGPU_INLINE_HELAMPS) and separate linking (MGONGPU_LINKER_HELAMPS) are mutually exclusive
+#if defined MGONGPU_INLINE_HELAMPS and defined MGONGPU_LINKER_HELAMPS
+#error You must CHOOSE (AT MOST) ONLY ONE of MGONGPU_INLINE_HELAMPS or MGONGPU_LINKER_HELAMPS
+#endif
+
 // NB: namespace mgOnGpu includes types which are defined in exactly the same way for CPU and GPU builds (see #318 and #725)
 namespace mgOnGpu
 {