MultithreadCorner
diff --git a/‎dependencies/cub b/‎dependencies/cub
diff --git a/‎examples/dot_products_with_zip.cu
+18-18 b/‎examples/dot_products_with_zip.cu
+18-18
diff --git a/‎thrust/detail/allocator/temporary_allocator.inl
+11-6 b/‎thrust/detail/allocator/temporary_allocator.inl
+11-6
diff --git a/‎thrust/detail/config/cpp_compatibility.h
+19-2 b/‎thrust/detail/config/cpp_compatibility.h
+19-2
diff --git a/‎thrust/detail/config/exec_check_disable.h
+5-1 b/‎thrust/detail/config/exec_check_disable.h
+5-1
diff --git a/‎thrust/detail/contiguous_storage.inl
+12-8 b/‎thrust/detail/contiguous_storage.inl
+12-8
diff --git a/‎thrust/detail/functional/actor.h
+1-1 b/‎thrust/detail/functional/actor.h
+1-1
diff --git a/‎thrust/detail/functional/actor.inl
+2-2 b/‎thrust/detail/functional/actor.inl
+2-2
diff --git a/‎thrust/detail/functional/argument.h
+1-1 b/‎thrust/detail/functional/argument.h
+1-1
diff --git a/‎thrust/detail/integer_math.h
+20-15 b/‎thrust/detail/integer_math.h
+20-15
diff --git a/‎thrust/detail/seq.h
+2-6 b/‎thrust/detail/seq.h
+2-6
diff --git a/‎thrust/execution_policy.h
+1-5 b/‎thrust/execution_policy.h
+1-5
@@ -6,9 +6,9 @@
 #include <thrust/random.h>
 
 
-// This example shows how thrust::zip_iterator can be used to create a 
-// 'virtual' array of structures.  In this case the structure is a 3d 
-// vector type (Float3) whose (x,y,z) components will be stored in 
+// This example shows how thrust::zip_iterator can be used to create a
+// 'virtual' array of structures.  In this case the structure is a 3d
+// vector type (Float3) whose (x,y,z) components will be stored in
 // three separate float arrays.  The zip_iterator "zips" these arrays
 // into a single virtual Float3 array.
 
@@ -54,17 +54,17 @@ int main(void)
     // We'll store the components of the 3d vectors in separate arrays. One set of
     // arrays will store the 'A' vectors and another set will store the 'B' vectors.
 
-    // This 'structure of arrays' (SoA) approach is usually more efficient than the 
+    // This 'structure of arrays' (SoA) approach is usually more efficient than the
     // 'array of structures' (AoS) approach.  The primary reason is that structures,
     // like Float3, don't always obey the memory coalescing rules, so they are not
     // efficiently transferred to and from memory.  Another reason to prefer SoA to
     // AoS is that we don't aways want to process all members of the structure.  For
-    // example, if we only need to look at first element of the structure then it 
+    // example, if we only need to look at first element of the structure then it
     // is wasteful to load the entire structure from memory.  With the SoA approach,
     // we can chose which elements of the structure we wish to read.
 
     thrust::device_vector<float> A0 = random_vector(N);  // x components of the 'A' vectors
-    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors 
+    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors
     thrust::device_vector<float> A2 = random_vector(N);  // z components of the 'A' vectors
 
     thrust::device_vector<float> B0 = random_vector(N);  // x components of the 'B' vectors
@@ -78,7 +78,7 @@ int main(void)
     // We'll now illustrate two ways to use zip_iterator to compute the dot
     // products.  The first method is verbose but shows how the parts fit together.
     // The second method hides these details and is more concise.
-   
+
 
     // METHOD #1
     // Defining a zip_iterator type can be a little cumbersome ...
@@ -87,24 +87,24 @@ int main(void)
     typedef thrust::zip_iterator<FloatIteratorTuple>                   Float3Iterator;
 
     // Now we'll create some zip_iterators for A and B
-    Float3Iterator A_first = thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin()));
-    Float3Iterator A_last  = thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end()));
-    Float3Iterator B_first = thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin()));
-                            
+    Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin()));
+    Float3Iterator A_last  = thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end()));
+    Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin()));
+
     // Finally, we pass the zip_iterators into transform() as if they
     // were 'normal' iterators for a device_vector<Float3>.
     thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());
 
 
     // METHOD #2
-    // Alternatively, we can avoid creating variables for X_first, X_last, 
+    // Alternatively, we can avoid creating variables for X_first, X_last,
     // and Y_first and invoke transform() directly.
-    thrust::transform( thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())),
-                       thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end())),
-                       thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())),
+    thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())),
+                       thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end())),
+                       thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())),
                        result.begin(),
                        DotProduct() );
-    
+
 
 
     // Finally, we'll print a few results
@@ -126,8 +126,8 @@ int main(void)
         std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")";
         std::cout << " = ";
         std::cout << dot << std::endl;
-    }   
+    }
 
     return 0;
 }
- 
+
@@ -20,7 +20,8 @@
 #include <thrust/system/detail/bad_alloc.h>
 #include <cassert>
 
-#if defined(__CUDA_ARCH__) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#if (defined(__NVCOMPILER_CUDA__) || defined(__CUDA_ARCH__)) && \
+    THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
 #include <thrust/system/cuda/detail/terminate.h>
 #endif
 
@@ -45,11 +46,15 @@ __host__ __device__
     // note that we pass cnt to deallocate, not a value derived from result.second
     deallocate(result.first, cnt);
 
-#if !defined(__CUDA_ARCH__)
-    throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
-#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
-#endif
+    if (THRUST_IS_HOST_CODE) {
+      #if THRUST_INCLUDE_HOST_CODE
+        throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+      #endif
+    } else {
+      #if THRUST_INCLUDE_DEVICE_CODE
+        thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+      #endif
+    }
   } // end if
 
   return result.first;
 
@@ -49,13 +49,13 @@
 // FIXME: Combine THRUST_INLINE_CONSTANT and
 // THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly
 // supports `constexpr` globals in host and device code.
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA__)
 // FIXME: Add this when NVCC supports inline variables.
 //#  if   THRUST_CPP_DIALECT >= 2017
 //#    define THRUST_INLINE_CONSTANT                 inline constexpr
 //#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
 #  if THRUST_CPP_DIALECT >= 2011
-#    define THRUST_INLINE_CONSTANT                 static constexpr
+#    define THRUST_INLINE_CONSTANT                 static const __device__
 #    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
 #  else
 #    define THRUST_INLINE_CONSTANT                 static const __device__
@@ -75,3 +75,20 @@
 #  endif
 #endif
 
+#if defined(__NVCOMPILER_CUDA__)
+#  define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
+#  define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
+#  define THRUST_INCLUDE_DEVICE_CODE 1
+#  define THRUST_INCLUDE_HOST_CODE 1
+#elif defined(__CUDA_ARCH__)
+#  define THRUST_IS_DEVICE_CODE 1
+#  define THRUST_IS_HOST_CODE 0
+#  define THRUST_INCLUDE_DEVICE_CODE 1
+#  define THRUST_INCLUDE_HOST_CODE 0
+#else
+#  define THRUST_IS_DEVICE_CODE 0
+#  define THRUST_IS_HOST_CODE 1
+#  define THRUST_INCLUDE_DEVICE_CODE 0
+#  define THRUST_INCLUDE_HOST_CODE 1
+#endif
+
@@ -22,7 +22,11 @@
 
 #include <thrust/detail/config.h>
 
-#if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__))
+// #pragma nv_exec_check_disable is only recognized by NVCC.  Having a macro
+// expand to a #pragma (rather than _Pragma) only works with NVCC's compilation
+// model, not with other compilers.
+#if defined(__CUDACC__) && !defined(__NVCOMPILER_CUDA__) && \
+    !(defined(__CUDA__) && defined(__clang__))
 
 #define __thrust_exec_check_disable__ #pragma nv_exec_check_disable
 
 
@@ -430,15 +430,19 @@ __host__ __device__
   void contiguous_storage<T,Alloc>
     ::swap_allocators(false_type, Alloc &other)
 {
-#ifdef __CUDA_ARCH__
-  // allocators must be equal when swapping containers with allocators that propagate on swap
-  assert(!is_allocator_not_equal(other));
-#else
-  if (is_allocator_not_equal(other))
-  {
-    throw allocator_mismatch_on_swap();
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      // allocators must be equal when swapping containers with allocators that propagate on swap
+      assert(!is_allocator_not_equal(other));
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      if (is_allocator_not_equal(other))
+      {
+        throw allocator_mismatch_on_swap();
+      }
+    #endif
   }
-#endif
   thrust::swap(m_allocator, other);
 } // end contiguous_storage::swap_allocators()
 
 
@@ -52,7 +52,7 @@ template<typename Eval>
   typedef Eval eval_type;
 
   __host__ __device__
-  actor(void);
+  THRUST_CONSTEXPR actor();
 
   __host__ __device__
   actor(const Eval &base);
 
@@ -38,8 +38,8 @@ namespace functional
 
 template<typename Eval>
   __host__ __device__
-  actor<Eval>
-    ::actor(void)
+  THRUST_CONSTEXPR actor<Eval>
+    ::actor()
       : eval_type()
 {}
 
 
@@ -59,7 +59,7 @@ template<unsigned int i>
     };
 
     __host__ __device__
-    argument(void){}
+    THRUST_CONSTEXPR argument(){}
 
     template<typename Env>
     __host__ __device__
 
@@ -32,22 +32,27 @@ template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 Integer clz(Integer x)
 {
-#if __CUDA_ARCH__
-  return ::__clz(x);
-#else
-  int num_bits = 8 * sizeof(Integer);
-  int num_bits_minus_one = num_bits - 1;
-
-  for (int i = num_bits_minus_one; i >= 0; --i)
-  {
-    if ((Integer(1) << i) & x)
-    {
-      return num_bits_minus_one - i;
-    }
+  Integer result;
+  if (THRUST_IS_DEVICE_CODE) {
+    #if THRUST_INCLUDE_DEVICE_CODE
+      result = ::__clz(x);
+    #endif
+  } else {
+    #if THRUST_INCLUDE_HOST_CODE
+      int num_bits = 8 * sizeof(Integer);
+      int num_bits_minus_one = num_bits - 1;
+      result = num_bits;
+      for (int i = num_bits_minus_one; i >= 0; --i)
+      {
+        if ((Integer(1) << i) & x)
+        {
+          result = num_bits_minus_one - i;
+          break;
+        }
+      }
+    #endif
   }
-
-  return num_bits;
-#endif
+  return result;
 }
 
 template <typename Integer>
 
@@ -31,7 +31,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
     thrust::system::detail::sequential::execution_policy>
 {
   __host__ __device__
-  seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
+  THRUST_CONSTEXPR seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
 
   // allow any execution_policy to convert to seq_t
   template<typename DerivedPolicy>
@@ -45,11 +45,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
 } // end detail
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ detail::seq_t seq;
-#else
-static const detail::seq_t seq;
-#endif
+THRUST_INLINE_CONSTANT detail::seq_t seq;
 
 
 } // end thrust
 
@@ -344,11 +344,7 @@ static const detail::host_t host;
  *  \see host_execution_policy
  *  \see thrust::device
  */
-#ifdef __CUDA_ARCH__
-static const __device__ detail::device_t device;
-#else
-static const detail::device_t device;
-#endif
+THRUST_INLINE_CONSTANT detail::device_t device;
 
 
 // define seq for the purpose of Doxygenating it