diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
index 0250c160ed..ef3d99d07c 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
@@ -172,7 +172,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"  // e.g. <<warning: unused variable ‘mdl_G__exp__2’ [-Wunused-variable]>>
 #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <<warning: unused parameter ‘G’ [-Wunused-parameter]>>
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // e.g. <<warning #177-D: variable "mdl_G__exp__2" was declared but never referenced>>
 #endif
@@ -194,9 +194,9 @@ namespace mg5amcCpu
 %(dcoupsetdcoup)s
       }
 %(eftspecial2)s
-      return out;
-    }
-#ifdef __CUDACC__
+    return out;
+  }
+#ifdef MGONGPUCPP_GPUIMPL
 #pragma GCC diagnostic pop
 #pragma nv_diagnostic pop
 #endif
@@ -212,6 +212,12 @@ namespace mg5amcCpu
 
   //==========================================================================
 
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
 #pragma GCC diagnostic push
 #ifndef __clang__
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <<warning: variable ‘couplings_sv’ set but not used [-Wunused-but-set-variable]>>
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
deleted file mode 100644
index df0c3f3df8..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
-
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
-
-// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
-// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-
-#include <cassert>
-#include <iostream>
-
-//--------------------------------------------------------------------------
-
-// See https://stackoverflow.com/a/14038590
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
-{
-  if( code != cudaSuccess )
-  {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
-  }
-}
-#endif /* clang-format on */
-
-//--------------------------------------------------------------------------
-
-#ifdef MGONGPUCPP_GPUIMPL
-namespace mg5amcGpu
-{
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
-  {
-    CudaRuntime( const bool debug = true )
-      : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
-    bool m_debug;
-
-    // Set up CUDA application
-    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
-    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
-    static void setUp( const bool debug = true )
-    {
-      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
-      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
-      /*
-      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
-      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
-      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
-      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
-      checkCuda( cudaFree( 0 ) ); // SLOW!
-      */
-      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
-      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
-    }
-
-    // Tear down CUDA application (call cudaDeviceReset)
-    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
-    // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
-    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
-    static void tearDown( const bool debug = true )
-    {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
-    }
-  };
-
-}
-#endif
-
-//--------------------------------------------------------------------------
-
-#endif // MG5AMC_CUDARUNTIME_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
index 9a39220077..491dfc02e1 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc
@@ -12,6 +12,7 @@
 #include "BridgeKernels.h"
 #include "CPPProcess.h"
 #include "CrossSectionKernels.h"
+#include "GpuRuntime.h"
 #include "MatrixElementKernels.h"
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
@@ -102,12 +103,12 @@ main( int argc, char** argv )
     CurandHost = 1,
     CurandDevice = 2
   };
-#ifdef MGONGPUCPP_GPUIMPL
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU
+#ifdef __CUDACC__
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU
 #elif not defined MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost;  // default on CPU if build has curand
 #else
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -145,10 +146,10 @@ main( int argc, char** argv )
     }
     else if( arg == "--curdev" )
     {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
       rndgen = RandomNumberMode::CurandDevice;
 #else
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" );
 #endif
     }
     else if( arg == "--curhst" )
@@ -265,12 +266,12 @@ main( int argc, char** argv )
 
 #ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginnining of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a. Initialise physics process
@@ -394,7 +395,7 @@ main( int argc, char** argv )
     const bool onDevice = false;
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
   }
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   else
   {
     const bool onDevice = true;
@@ -403,7 +404,7 @@ main( int argc, char** argv )
 #else
   else
   {
-    throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement)
   }
 #endif
 #else
@@ -729,17 +730,21 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
-#ifdef MGONGPUCPP_GPUIMPL
+  // -- CUDA or HIP or C++?
+#ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
 #endif
@@ -754,7 +759,7 @@ main( int argc, char** argv )
   wrkflwtxt += "???+"; // no path to this statement
 #endif
   // -- CUCOMPLEX or THRUST or STD complex numbers?
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
   wrkflwtxt += "CUX:";
 #elif defined MGONGPU_CUCXTYPE_THRUST
@@ -764,6 +769,12 @@ main( int argc, char** argv )
 #else
   wrkflwtxt += "???:"; // no path to this statement
 #endif
+#elif defined __HIPCC__
+#if defined MGONGPU_CUCXTYPE_CXSMPL
+  wrkflwtxt += "CXS:";
+#else
+  wrkflwtxt += "???:"; // no path to this statement
+#endif
 #else
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
   wrkflwtxt += "STX:";
@@ -864,8 +875,10 @@ main( int argc, char** argv )
 #endif
     // Dump all configuration parameters and all results
     std::cout << std::string( SEP79, '*' ) << std::endl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
               << "Process                     = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA"
+#elif defined __HIPCC__
+              << "Process                     = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP"
 #else
               << "Process                     = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP"
 #endif
@@ -892,14 +905,14 @@ main( int argc, char** argv )
 #elif defined MGONGPU_FPTYPE_FLOAT
               << "FP precision                = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl
 #endif
-#ifdef MGONGPUCPP_GPUIMPL
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
               << "Complex type                = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type                = THRUST::COMPLEX" << std::endl
-#endif
-#else
+#elif defined MGONGPU_CUCXTYPE_CXSMPL
               << "Complex type                = STD::COMPLEX" << std::endl
+#else
+              << "Complex type                = ???" << std::endl // no path to this statement...
 #endif
               << "RanNumb memory layout       = AOSOA[" << neppR << "]"
               << ( neppR == 1 ? " == AOS" : "" )
@@ -1033,14 +1046,14 @@ main( int argc, char** argv )
              << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl
 #endif
              << "\"Complex type\": "
-#ifdef MGONGPUCPP_GPUIMPL
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
              << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
              << "\"THRUST::COMPLEX\"," << std::endl
-#endif
-#else
+#elif defined MGONGPU_CUCXTYPE_CXSMPL
              << "\"STD::COMPLEX\"," << std::endl
+#else
+             << "\"???\"," << std::endl                           // no path to this statement...
 #endif
              << "\"RanNumb memory layout\": "
              << "\"AOSOA[" << neppR << "]\""
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
index 5b04029787..1811de4699 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
@@ -6,21 +6,31 @@
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
 
-#include "GpuRuntime.h" // Includes the GPU abstraction
-
 // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473)
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 %(mgongpu_supports_multichannel)s
 
+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)
 
 // Choose if curand is supported for generating random numbers
+// For CUDA, by default, it is supported
+// For HIP, by default, it is not supported
 // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
-#ifdef MGONGPUCPP_CUDACC
+#ifdef __CUDACC__
 #undef MGONGPU_HAS_NO_CURAND
-#elif defined MGONGPUCPP_HIPCC
+#elif defined __HIPCC__
 #define MGONGPU_HAS_NO_CURAND 1
+#else
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
 #endif
@@ -55,23 +65,28 @@
 //#undef MGONGPU_HARDCODE_PARAM // default
 ////#define MGONGPU_HARDCODE_PARAM 1
 
-// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#ifndef MGONGPUCPP_GPUIMPL
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
-#endif
-
-// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
-#ifdef MGONGPUCPP_GPUIMPL
+// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+#ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+#elif defined __HIPCC__
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#else
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
-#ifdef MGONGPUCPP_GPUIMPL
-#undef MGONGPU_NSIGHT_DEBUG // default
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+#ifdef __CUDACC__
+#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
 //#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif
 
 // SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -87,17 +102,21 @@
 #error You cannot use double precision for color algebra and single precision elsewhere
 #endif
 
-// SANITY CHECKS (c++ complex number implementation)
-#ifndef MGONGPUCPP_GPUIMPL
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif
 
-// SANITY CHECKS (cuda complex number implementation)
-#ifdef MGONGPUCPP_GPUIMPL
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif
 
@@ -146,7 +165,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;
 
 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -176,9 +195,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif
 
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
+#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -190,7 +209,7 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */
 
-// Define empty CUDA declaration specifiers for C++
+// Define empty CUDA/HIP declaration specifiers for C++
 #ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
index cacab1031a..ed3e219f8a 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
@@ -69,6 +69,8 @@
 #ifdef __CUDACC__
 #undef MGONGPU_NSIGHT_DEBUG // default
 //#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif
 
 // SANITY CHECKS (floating point precision for everything but color algebra #537)
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
index e1299ba81e..e91f5927d6 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h
@@ -9,6 +9,8 @@
 #include "mgOnGpuCxtypes.h"
 #include "mgOnGpuFptypes.h"
 
+#include "GpuAbstraction.h"
+
 #include <iostream>
 
 //==========================================================================