From a995282146b116152d9e70966546b35c68968e73 Mon Sep 17 00:00:00 2001 From: zhouqili Date: Mon, 24 Jun 2024 16:47:45 +0800 Subject: [PATCH] use template specialization to support compile-time branch judgment --- includes/rtm/impl/vector4f_swizzle.h | 149 +++++++++++++++++++++++++++ includes/rtm/impl/vector_swizzle.h | 121 ---------------------- includes/rtm/vector4f.h | 147 ++++++++++++++++++++------ 3 files changed, 262 insertions(+), 155 deletions(-) create mode 100644 includes/rtm/impl/vector4f_swizzle.h delete mode 100644 includes/rtm/impl/vector_swizzle.h diff --git a/includes/rtm/impl/vector4f_swizzle.h b/includes/rtm/impl/vector4f_swizzle.h new file mode 100644 index 0000000..5aec3b0 --- /dev/null +++ b/includes/rtm/impl/vector4f_swizzle.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include + +#include "rtm/math.h" +#include "rtm/types.h" +#include "rtm/impl/compiler_utils.h" +#include "rtm/scalarf.h" +#include "rtm/scalard.h" +#include "rtm/version.h" + +RTM_IMPL_FILE_PRAGMA_PUSH + +namespace rtm +{ + RTM_IMPL_VERSION_NAMESPACE_BEGIN + +#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_SSE4_INTRINSICS) || defined(RTM_AVX2_INTRINSICS) + +namespace sse_permute +{ +#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) ) + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float swizzle + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template + RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec) + { + return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3)); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec) + { + return vec; + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec) + { + return _mm_movelh_ps(vec, vec); + } + template<> 
RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec) + { + return _mm_movehl_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec) + { + return _mm_unpacklo_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec) + { + return _mm_unpackhi_ps(vec, vec); + } + +#if defined(RTM_SSE4_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec) + { + return _mm_moveldup_ps(vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec) + { + return _mm_movehdup_ps(vec); + } +#endif + +#if defined(RTM_AVX2_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec) + { + return _mm_broadcastss_ps(vec); + } +#endif + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float replicate + template + RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) + { + static_assert(index >= 0 && index <= 3, "Invalid Index"); + return vector_swizzle_impl(vec); + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float shuffle + template + RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2) + { + static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index"); + return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3)); + } + + // Float Shuffle specializations + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) + { + return _mm_movelh_ps(vec1, vec2); + + } + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const 
vector4f& vec1, const vector4f& vec2) + { + // Note: movehl copies first from the 2nd argument + return _mm_movehl_ps(vec2, vec1); + } + +}; // namespace sse_permute + + +#define VECTOR_REPLICATE( vec, element_index ) sse_permute::vector_replicate_impl(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) sse_permute::vector_swizzle_impl( vec ) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) sse_permute::vector_shuffle_impl( vec1, vec2 ) + +#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__) +//now we only support __clang__ neon here + +template +RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) +{ + return __builtin_shufflevector(vec, vec, x, y, z, w); +} + +template +RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) +{ + return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); +} + +template +RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) +{ + return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); +} + +template +RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec) +{ + return vdupq_n_f64(vgetq_lane_f64(vec, element_index)); +} + +#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl(vec) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl(vec1, vec2) + +//#else +//#pragma error("vector swizzle not implement here!"); +#endif + +RTM_IMPL_VERSION_NAMESPACE_END +} + +RTM_IMPL_FILE_PRAGMA_POP + diff --git a/includes/rtm/impl/vector_swizzle.h b/includes/rtm/impl/vector_swizzle.h deleted file mode 100644 index f837108..0000000 --- a/includes/rtm/impl/vector_swizzle.h +++ /dev/null @@ -1,121 +0,0 @@ -#pragma once - -#include -#include -#include - -// #include -#include "rtm/math.h" -#include "rtm/types.h" -#include "rtm/impl/compiler_utils.h" -#include "rtm/scalarf.h" -#include "rtm/scalard.h" -#include "rtm/version.h" - - -RTM_IMPL_FILE_PRAGMA_PUSH - -namespace rtm -{ - 
RTM_IMPL_VERSION_NAMESPACE_BEGIN - -#if defined(RTM_SSE2_INTRINSICS) - -namespace sse2_permute -{ -#define SHUFFLEMASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) ) - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Float swizzle - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - template - RTM_FORCE_INLINE vector4f vector_swizzle_template(const vector4f& vec) - { - return _mm_shuffle_ps(vec, vec, SHUFFLEMASK(index0, index1, index2, index3)); - } - - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 2, 3>(const vector4f& vec) { return vec; } - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 0, 1>(const vector4f& vec) { return _mm_movelh_ps(vec, vec); } - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 3, 2, 3>(const vector4f& vec) { return _mm_movehl_ps(vec, vec); } - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 1, 1>(const vector4f& vec) { return _mm_unpacklo_ps(vec, vec); } - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 2, 3, 3>(const vector4f& vec) { return _mm_unpackhi_ps(vec, vec); } - -#if defined(RTM_SSE4_INTRINSICS) - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 2, 2>(const vector4f& vec) { return _mm_moveldup_ps(vec); } - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<1, 1, 3, 3>(const vector4f& vec) { return _mm_movehdup_ps(vec); } -#endif - -#if defined(RTM_AVX2_INTRINSICS) - template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 0, 0>(const vector4f& vec) { return _mm_broadcastss_ps(vec); } -#endif - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Float replicate - template - RTM_FORCE_INLINE vector4f vector_replicate_template(const vector4f& 
vec) - { - static_assert(Index >= 0 && Index <= 3, "Invalid Index"); - return vector_swizzle_template(vec); - } - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Float shuffle - template - RTM_FORCE_INLINE vector4f vector_shuffle_template(const vector4f& vec1, const vector4f& vec2) - { - static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index"); - return _mm_shuffle_ps(vec1, vec2, SHUFFLEMASK(index0, index1, index2, index3)); - } - - // Float Shuffle specializations - template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) { return _mm_movelh_ps(vec1, vec2); } - template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) { return _mm_movehl_ps(vec2, vec1); } // Note: movehl copies first from the 2nd argument - -}; // namespace sse2_permute - - -#define VECTOR_REPLICATE( vec, element_index ) sse2_permute::vector_replicate_template(vec) -#define VECTOR_SWIZZLE( vec, x, y, z, w ) sse2_permute::vector_swizzle_template( vec ) -#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) sse2_permute::vector_shuffle_template( vec1, vec2 ) - -#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__) -//now we only support __clang__ neon here - -template -RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) -{ - return __builtin_shufflevector(vec, vec, x, y, z, w); -} - -template -RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) -{ - return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); -} - -template -RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) -{ - return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); -} - -template -RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec) -{ - return 
vdupq_n_f64(vgetq_lane_f64(vec, element_index)); -} - -#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl(vec) -#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl(vec) -#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl(vec1, vec2) - -#else -#pragma error("vector swizzle not implement here!"); -#endif - -RTM_IMPL_VERSION_NAMESPACE_END -} - -RTM_IMPL_FILE_PRAGMA_POP - diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index de99b78..25bfbcc 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -34,7 +34,7 @@ #include "rtm/impl/macros.mask4.impl.h" #include "rtm/impl/memory_utils.h" #include "rtm/impl/vector_common.h" -#include "rtm/impl/vector_swizzle.h" +#include "rtm/impl/vector4f_swizzle.h" #include #include @@ -3422,63 +3422,142 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Mixes two inputs and returns the desired components. // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Slow code path, not yet optimized or not using intrinsics ////////////////////////////////////////////////////////////////////////// - template - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_slow(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + float combine_arr[8]; + vector_store(input0, combine_arr); + vector_store(input1, combine_arr + 4); + return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]); + } + + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // Overload picked at compile time when index0/index1 select from input0 and index2/index3 select from input1 + ////////////////////////////////////////////////////////////////////////// + template <int index0, int index1, int index2, int index3, typename std::enable_if<(index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT { - constexpr int index0 = (int)comp0; - constexpr int index1 = (int)comp1; - constexpr int index2 = (int)comp2; - constexpr int index3 = (int)comp3; -#if defined(__clang__) - return __builtin_shufflevector(input0, input1, index0, index1, index2, index3); +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SHUFFLE) // scalar path when this target has no shuffle wrapper + return vector_mix_slow<index0, index1, index2, index3>(input0, input1); #else - if constexpr (index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) { - return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4); - } - else if constexpr(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) { - //no input1 use here - return VECTOR_SWIZZLE(input0, index0, index1, index2, index3); - } - else if constexpr(index0 >=4 && index1 >=4 && index2 >=4 && index3 >=4) { - //no input0 use here - return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 -4); - }else { - - float combine_arr[8]; - vector_store(input0, combine_arr); - vector_store(input1, combine_arr + 4); - return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]); - } + return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4); +#endif + } + + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // Overload picked at compile time when all four indices select from input0 + ////////////////////////////////////////////////////////////////////////// + template <int index0, int index1, int index2, int index3, typename std::enable_if<(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4), int>::type = 0> + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SWIZZLE) // scalar path when this target has no swizzle wrapper + return vector_mix_slow<index0, index1, index2, index3>(input0, input1); +#else + (void)input1; + return VECTOR_SWIZZLE(input0, index0, index1, index2, index3); #endif } + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Overload picked at compile time when all four indices select from input1 + ////////////////////////////////////////////////////////////////////////// + template <int index0, int index1, int index2, int index3, typename std::enable_if<(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_SWIZZLE) // scalar path when this target has no swizzle wrapper + return vector_mix_slow<index0, index1, index2, index3>(input0, input1); +#else + (void)input0; + return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 - 4); +#endif + } + + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Fallback: index combinations without a dedicated overload use the slow version + ////////////////////////////////////////////////////////////////////////// + template <int index0, int index1, int index2, int index3, typename std::enable_if<!(index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) && + !(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) && + !(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + return vector_mix_slow<index0, index1, index2, index3>(input0, input1); + } + + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. 
+ // [xyzw] indexes into the first input while [abcd] indexes in the second. + ////////////////////////////////////////////////////////////////////////// + template <mix4 comp0, mix4 comp1, mix4 comp2, mix4 comp3> + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + constexpr int index0 = static_cast<int>(comp0); + constexpr int index1 = static_cast<int>(comp1); + constexpr int index2 = static_cast<int>(comp2); + constexpr int index3 = static_cast<int>(comp3); + + return vector_swizzle_with_index<index0, index1, index2, index3>(input0, input1); // overload is resolved at compile time from the four indices + } + ////////////////////////////////////////////////////////////////////////// // Replicates the [x] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { - return VECTOR_REPLICATE(input, 0); + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate wrapper for this target: go through vector_mix + return vector_mix<mix4::x, mix4::x, mix4::x, mix4::x>(input, input); +#else + return VECTOR_REPLICATE(input, 0); +#endif } ////////////////////////////////////////////////////////////////////////// // Replicates the [y] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { - return VECTOR_REPLICATE(input, 1); + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate wrapper for this target: go through vector_mix + return vector_mix<mix4::y, mix4::y, mix4::y, mix4::y>(input, input); +#else + return VECTOR_REPLICATE(input, 1); +#endif } ////////////////////////////////////////////////////////////////////////// // Replicates the [z] component in all components. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { - return VECTOR_REPLICATE(input, 2); + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate wrapper for this target: go through vector_mix + return vector_mix<mix4::z, mix4::z, mix4::z, mix4::z>(input, input); +#else + return VECTOR_REPLICATE(input, 2); +#endif } ////////////////////////////////////////////////////////////////////////// // Replicates the [w] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { - return VECTOR_REPLICATE(input, 3); + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) || !defined(VECTOR_REPLICATE) // no replicate wrapper for this target: go through vector_mix + return vector_mix<mix4::w, mix4::w, mix4::w, mix4::w>(input, input); +#else + return VECTOR_REPLICATE(input, 3); +#endif } //////////////////////////////////////////////////////////////////////////