From a995282146b116152d9e70966546b35c68968e73 Mon Sep 17 00:00:00 2001
From: zhouqili <zhouqili@bytedance.com>
Date: Mon, 24 Jun 2024 16:47:45 +0800
Subject: [PATCH] use template specialization to support compile-time branch
 judgment

---
 includes/rtm/impl/vector4f_swizzle.h | 149 +++++++++++++++++++++++++++
 includes/rtm/impl/vector_swizzle.h   | 121 ----------------------
 includes/rtm/vector4f.h              | 147 ++++++++++++++++++++------
 3 files changed, 262 insertions(+), 155 deletions(-)
 create mode 100644 includes/rtm/impl/vector4f_swizzle.h
 delete mode 100644 includes/rtm/impl/vector_swizzle.h
diff --git a/includes/rtm/impl/vector4f_swizzle.h b/includes/rtm/impl/vector4f_swizzle.h
new file mode 100644
index 0000000..5aec3b0
--- /dev/null
+++ b/includes/rtm/impl/vector4f_swizzle.h
@@ -0,0 +1,149 @@
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "rtm/math.h"
+#include "rtm/types.h"
+#include "rtm/impl/compiler_utils.h"
+#include "rtm/scalarf.h"
+#include "rtm/scalard.h"
+#include "rtm/version.h"
+
+RTM_IMPL_FILE_PRAGMA_PUSH
+
+namespace rtm
+{
+	RTM_IMPL_VERSION_NAMESPACE_BEGIN
+
+#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_SSE4_INTRINSICS) || defined(RTM_AVX2_INTRINSICS)
+
+namespace sse_permute
+{
+// Packs four 2-bit lane selectors into the 8-bit immediate of _mm_shuffle_ps (first selector in the low bits).
+#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float swizzle: returns { vec[index0], vec[index1], vec[index2], vec[index3] }, every index must be in [0, 3]
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec)
+	{
+		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
+		return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3));
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec)
+	{
+		return vec; // identity permutation, nothing to shuffle
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec)
+	{
+		return _mm_movelh_ps(vec, vec); // low half copied into both halves
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec)
+	{
+		return _mm_movehl_ps(vec, vec); // high half copied into both halves
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec)
+	{
+		return _mm_unpacklo_ps(vec, vec); // interleave the low half with itself
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec)
+	{
+		return _mm_unpackhi_ps(vec, vec); // interleave the high half with itself
+	}
+
+#if defined(RTM_SSE4_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec)
+	{
+		return _mm_moveldup_ps(vec); // duplicate the even lanes
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec)
+	{
+		return _mm_movehdup_ps(vec); // duplicate the odd lanes
+	}
+#endif
+
+#if defined(RTM_AVX2_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec)
+	{
+		return _mm_broadcastss_ps(vec); // broadcast lane 0
+	}
+#endif
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float replicate: copies vec[index] into all four lanes
+	template<int index>
+	RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
+	{
+		static_assert(index >= 0 && index <= 3, "Invalid Index");
+		return vector_swizzle_impl<index, index, index, index>(vec);
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float shuffle: returns { vec1[index0], vec1[index1], vec2[index2], vec2[index3] }, every index must be in [0, 3]
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2)
+	{
+		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
+		return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3));
+	}
+
+	// Float Shuffle specializations
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2)
+	{
+		return _mm_movelh_ps(vec1, vec2);
+	}
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2)
+	{
+		// Note: movehl copies first from the 2nd argument
+		return _mm_movehl_ps(vec2, vec1);
+	}
+
+} // namespace sse_permute
+
+
+#define VECTOR_REPLICATE( vec, element_index )	sse_permute::vector_replicate_impl<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w )		sse_permute::vector_swizzle_impl<x,y,z,w>( vec )
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )		sse_permute::vector_shuffle_impl<x,y,z,w>( vec1, vec2 )
+
+#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__)
+// NEON is currently only supported when compiling with clang (this path relies on __builtin_shufflevector).
+
+template <int lane0, int lane1, int lane2, int lane3>
+RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
+{
+	return __builtin_shufflevector(vec, vec, lane0, lane1, lane2, lane3); // both operands are vec
+}
+
+template <int lane0, int lane1, int lane2, int lane3>
+RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
+{
+	return __builtin_shufflevector(vec1, vec2, lane0, lane1, 4 + lane2, 4 + lane3); // selectors 4..7 address vec2
+}
+
+template <int lane>
+RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
+{
+	return vdupq_n_f32(vgetq_lane_f32(vec, lane)); // read one float lane, then broadcast it
+}
+
+template <int lane>
+RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec)
+{
+	return vdupq_n_f64(vgetq_lane_f64(vec, lane)); // read one double lane, then broadcast it
+}
+
+#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>(vec)
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )	vector_shuffle_impl<x, y, z, w>(vec1, vec2)
+
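+// Intentionally no #else/#error: on other targets the VECTOR_* macros are simply left undefined,
+// so every use of them must be guarded by the caller (vector4f.h falls back to scalar code).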
+#endif
+
+RTM_IMPL_VERSION_NAMESPACE_END
+}
+
+RTM_IMPL_FILE_PRAGMA_POP
+
diff --git a/includes/rtm/impl/vector_swizzle.h b/includes/rtm/impl/vector_swizzle.h
deleted file mode 100644
index f837108..0000000
--- a/includes/rtm/impl/vector_swizzle.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <type_traits>
-
-// #include <spatial/core/Platform.hpp>
-#include "rtm/math.h"
-#include "rtm/types.h"
-#include "rtm/impl/compiler_utils.h"
-#include "rtm/scalarf.h"
-#include "rtm/scalard.h"
-#include "rtm/version.h"
-
-
-RTM_IMPL_FILE_PRAGMA_PUSH
-
-namespace rtm
-{
-	RTM_IMPL_VERSION_NAMESPACE_BEGIN
-
-#if defined(RTM_SSE2_INTRINSICS)
-
-namespace sse2_permute
-{
-#define SHUFFLEMASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
-
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	// Float swizzle
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-	template<int index0, int index1, int index2, int index3>
-	RTM_FORCE_INLINE vector4f vector_swizzle_template(const vector4f& vec)
-	{
-		return _mm_shuffle_ps(vec, vec, SHUFFLEMASK(index0, index1, index2, index3));
-	}
-
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 2, 3>(const vector4f& vec) { return vec; }
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 0, 1>(const vector4f& vec) { return _mm_movelh_ps(vec, vec); }
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 3, 2, 3>(const vector4f& vec) { return _mm_movehl_ps(vec, vec); }
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 1, 1>(const vector4f& vec) { return _mm_unpacklo_ps(vec, vec); }
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 2, 3, 3>(const vector4f& vec) { return _mm_unpackhi_ps(vec, vec); }
-
-#if defined(RTM_SSE4_INTRINSICS)
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 2, 2>(const vector4f& vec) { return _mm_moveldup_ps(vec); }
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<1, 1, 3, 3>(const vector4f& vec) { return _mm_movehdup_ps(vec); }
-#endif
-
-#if defined(RTM_AVX2_INTRINSICS)
-	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 0, 0>(const vector4f& vec) { return _mm_broadcastss_ps(vec); }
-#endif
-
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	// Float replicate
-	template<int Index>
-	RTM_FORCE_INLINE vector4f vector_replicate_template(const vector4f& vec)
-	{
-		static_assert(Index >= 0 && Index <= 3, "Invalid Index");
-		return vector_swizzle_template<Index, Index, Index, Index>(vec);
-	}
-
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	// Float shuffle
-	template<int index0, int index1, int index2, int index3>
-	RTM_FORCE_INLINE vector4f vector_shuffle_template(const vector4f& vec1, const vector4f& vec2)
-	{
-		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
-		return _mm_shuffle_ps(vec1, vec2, SHUFFLEMASK(index0, index1, index2, index3));
-	}
-
-	// Float Shuffle specializations
-	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) { return _mm_movelh_ps(vec1, vec2); }
-	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) { return _mm_movehl_ps(vec2, vec1); } // Note: movehl copies first from the 2nd argument
-
-}; // namespace sse2_permute
-
-
-#define VECTOR_REPLICATE( vec, element_index )	sse2_permute::vector_replicate_template<element_index>(vec)
-#define VECTOR_SWIZZLE( vec, x, y, z, w )		sse2_permute::vector_swizzle_template<x,y,z,w>( vec )
-#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )		sse2_permute::vector_shuffle_template<x,y,z,w>( vec1, vec2 )
-
-#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__)
-//now we only support __clang__ neon here
-
-template <int x, int y, int z, int w>
-RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
-{
-	return __builtin_shufflevector(vec, vec, x, y, z, w);
-}
-
-template <int x, int y, int z, int w>
-RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
-{
-	return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4);
-}
-
-template <int element_index>
-RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
-{
-	return vdupq_n_f32(vgetq_lane_f32(vec, element_index));
-}
-
-template <int element_index>
-RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec)
-{
-	return vdupq_n_f64(vgetq_lane_f64(vec, element_index));
-}
-
-#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
-#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>(vec)
-#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )	vector_shuffle_impl<x, y, z, w>(vec1, vec2)
-
-#else
-#pragma error("vector swizzle not implement here!");
-#endif
-
-RTM_IMPL_VERSION_NAMESPACE_END
-}
-
-RTM_IMPL_FILE_PRAGMA_POP
-
diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h
index de99b78..25bfbcc 100644
--- a/includes/rtm/vector4f.h
+++ b/includes/rtm/vector4f.h
@@ -34,7 +34,7 @@
 #include "rtm/impl/macros.mask4.impl.h"
 #include "rtm/impl/memory_utils.h"
 #include "rtm/impl/vector_common.h"
-#include "rtm/impl/vector_swizzle.h"
+#include "rtm/impl/vector4f_swizzle.h"
 
 #include <cstring>
 #include <limits>
@@ -3422,63 +3422,142 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Mixes two inputs and returns the desired components.
 	// [xyzw] indexes into the first input while [abcd] indexes in the second.
+	// Slow code path, not yet optimized or not using intrinsics
 	//////////////////////////////////////////////////////////////////////////
-	template<mix4 comp0, mix4 comp1, mix4 comp2, mix4 comp3>
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+    template <int index0, int index1, int index2, int index3>
+    RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_slow(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+    {
+        float lanes[8]; // slots 0-3 are written from input0, slots 4-7 from input1
+        vector_store(input0, &lanes[0]);
+        vector_store(input1, &lanes[4]);
+        return vector_set(lanes[index0], lanes[index1], lanes[index2], lanes[index3]);
+    }
+
+	//////////////////////////////////////////////////////////////////////////
+	// Mixes two inputs and returns the desired components.
+	// Indices [0, 3] select from the first input, [4, 7] from the second.
+	// Overload selected at compile time for the [in0, in0, in1, in1] pattern.
+	//////////////////////////////////////////////////////////////////////////
+	template <int index0, int index1, int index2, int index3,
+		typename std::enable_if<(index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4), int>::type = 0>
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
 	{
-		constexpr int index0 = (int)comp0;
-		constexpr int index1 = (int)comp1;
-		constexpr int index2 = (int)comp2;
-		constexpr int index3 = (int)comp3;
-#if defined(__clang__)
-		return __builtin_shufflevector(input0, input1, index0, index1, index2, index3);
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SHUFFLE) // scalar path when no shuffle macro exists for this target
+		return vector_mix_slow<index0, index1, index2, index3>(input0, input1);
 #else
-		if constexpr (index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) {
-			return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4);
-		}
-		else if constexpr(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) {
-			//no input1 use here
-			return VECTOR_SWIZZLE(input0, index0, index1, index2, index3);
-		}
-		else if constexpr(index0 >=4 && index1 >=4 && index2 >=4 && index3 >=4) {
-			//no input0 use here
-			return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 -4);
-		}else {
-
-			float combine_arr[8];
-			vector_store(input0, combine_arr);
-			vector_store(input1, combine_arr + 4);
-			return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]);
-		}
+		return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4);
+#endif
 	}
+
+	//////////////////////////////////////////////////////////////////////////
+	// Mixes two inputs and returns the desired components.
+	// Indices [0, 3] select from the first input, [4, 7] from the second.
+	// Overload selected at compile time when every index reads the first input.
+	//////////////////////////////////////////////////////////////////////////
+	template <int index0, int index1, int index2, int index3,
+		typename std::enable_if<(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4), int>::type = 0>
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SWIZZLE) // scalar path when no swizzle macro exists for this target
+		return vector_mix_slow<index0, index1, index2, index3>(input0, input1);
+#else
+		(void)input1; // unused: all four components come from input0
+		return VECTOR_SWIZZLE(input0, index0, index1, index2, index3);
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Mixes two inputs and returns the desired components.
+	// Indices [0, 3] select from the first input, [4, 7] from the second.
+	// Overload selected at compile time when every index reads the second input.
+	//////////////////////////////////////////////////////////////////////////
+	template <int index0, int index1, int index2, int index3,
+		typename std::enable_if<(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0>
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SWIZZLE) // scalar path when no swizzle macro exists for this target
+		return vector_mix_slow<index0, index1, index2, index3>(input0, input1);
+#else
+		(void)input0; // unused: all four components come from input1
+		return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 - 4);
+#endif
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	// Mixes two inputs and returns the desired components.
+	// Indices [0, 3] select from the first input, [4, 7] from the second.
+	// Fallback overload: selected when none of the three fast index patterns apply.
+	//////////////////////////////////////////////////////////////////////////
+	template <int index0, int index1, int index2, int index3,
+		typename std::enable_if<!(index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) &&
+			!(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) &&
+			!(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0>
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+	{
+		return vector_mix_slow<index0, index1, index2, index3>(input0, input1);
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	// Builds a vector whose four components are picked from the two inputs.
+	// mix4 [xyzw] selects from the first input, mix4 [abcd] from the second.
+	//////////////////////////////////////////////////////////////////////////
+	template<mix4 comp0, mix4 comp1, mix4 comp2, mix4 comp3>
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
+	{
+		// The selectors are converted to plain ints so that overload resolution can dispatch on them.
+		constexpr int lane0 = static_cast<int>(comp0);
+		constexpr int lane1 = static_cast<int>(comp1);
+		constexpr int lane2 = static_cast<int>(comp2);
+		constexpr int lane3 = static_cast<int>(comp3);
+		return vector_swizzle_with_index<lane0, lane1, lane2, lane3>(input0, input1);
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [x] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT {
-		return VECTOR_REPLICATE(input, 0);
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate macro for this target: go through vector_mix
+		return vector_mix<mix4::x, mix4::x, mix4::x, mix4::x>(input, input);
+#else
+		return VECTOR_REPLICATE(input, 0);
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [y] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { 
-		return VECTOR_REPLICATE(input, 1);
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate macro for this target: go through vector_mix
+		return vector_mix<mix4::y, mix4::y, mix4::y, mix4::y>(input, input);
+#else
+		return VECTOR_REPLICATE(input, 1);
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [z] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { 
-		return VECTOR_REPLICATE(input, 2);
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate macro for this target: go through vector_mix
+		return vector_mix<mix4::z, mix4::z, mix4::z, mix4::z>(input, input);
+#else
+		return VECTOR_REPLICATE(input, 2);
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [w] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { 
-		return VECTOR_REPLICATE(input, 3);
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate macro for this target: go through vector_mix
+		return vector_mix<mix4::w, mix4::w, mix4::w, mix4::w>(input, input);
+#else
+		return VECTOR_REPLICATE(input, 3);
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////