From d2e2f47a303a9b1c25805d96d04beb4f07b57575 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:16:55 +0100 Subject: [PATCH] [jt774] (before merging upstream/master) improve logic of "if CUDA else HIP else neither" in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 228 +++++++++--------- 1 file changed, 108 insertions(+), 120 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 965c0e36bf..2864673ead 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -30,7 +30,7 @@ UNAME_P := $(shell uname -p) include ../../Source/make_opts #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -104,69 +104,73 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -177,71 +181,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -258,7 +246,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -274,7 +262,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -360,7 +348,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS))