From 8554eacb3d4325ddaec8f3c4479838b13271236c Mon Sep 17 00:00:00 2001 From: zhouqili Date: Fri, 21 Jun 2024 12:22:41 +0800 Subject: [PATCH] Optimized the performance of float object --- includes/rtm/impl/matrix_affine_common.h | 55 ++-- includes/rtm/impl/vector_swizzle.h | 121 +++++++ includes/rtm/matrix3x3f.h | 2 +- includes/rtm/matrix3x4f.h | 96 +++--- includes/rtm/matrix4x4f.h | 38 +-- includes/rtm/quatf.h | 14 +- includes/rtm/vector4f.h | 393 ++++++++--------------- 7 files changed, 355 insertions(+), 364 deletions(-) create mode 100644 includes/rtm/impl/vector_swizzle.h diff --git a/includes/rtm/impl/matrix_affine_common.h b/includes/rtm/impl/matrix_affine_common.h index 5bef3a43..df914e5c 100644 --- a/includes/rtm/impl/matrix_affine_common.h +++ b/includes/rtm/impl/matrix_affine_common.h @@ -58,18 +58,21 @@ namespace rtm { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized"); - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); @@ -80,19 +83,21 @@ namespace rtm RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized"); - - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); diff --git a/includes/rtm/impl/vector_swizzle.h b/includes/rtm/impl/vector_swizzle.h new file mode 100644 index 00000000..f837108d --- /dev/null +++ b/includes/rtm/impl/vector_swizzle.h @@ -0,0 +1,121 @@ +#pragma once + +#include +#include +#include + +// #include +#include "rtm/math.h" +#include "rtm/types.h" +#include "rtm/impl/compiler_utils.h" +#include "rtm/scalarf.h" +#include "rtm/scalard.h" +#include "rtm/version.h" + + +RTM_IMPL_FILE_PRAGMA_PUSH + +namespace rtm +{ + RTM_IMPL_VERSION_NAMESPACE_BEGIN + +#if defined(RTM_SSE2_INTRINSICS) + +namespace sse2_permute +{ +#define SHUFFLEMASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) ) + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float swizzle + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + template + RTM_FORCE_INLINE vector4f vector_swizzle_template(const vector4f& vec) + { + return _mm_shuffle_ps(vec, vec, SHUFFLEMASK(index0, index1, index2, index3)); + } + + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 2, 3>(const vector4f& vec) { return vec; } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 0, 1>(const vector4f& vec) { return _mm_movelh_ps(vec, vec); } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 3, 2, 3>(const vector4f& vec) { return _mm_movehl_ps(vec, vec); } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 1, 1>(const vector4f& vec) { return _mm_unpacklo_ps(vec, vec); } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 2, 3, 3>(const vector4f& vec) { return _mm_unpackhi_ps(vec, vec); } + +#if defined(RTM_SSE4_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 2, 2>(const vector4f& vec) { return _mm_moveldup_ps(vec); } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<1, 1, 3, 3>(const vector4f& vec) { return _mm_movehdup_ps(vec); } +#endif + +#if defined(RTM_AVX2_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 0, 0>(const vector4f& vec) { return _mm_broadcastss_ps(vec); } +#endif + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float replicate + template + RTM_FORCE_INLINE vector4f vector_replicate_template(const vector4f& vec) + { + static_assert(Index >= 0 && Index <= 3, "Invalid Index"); + return vector_swizzle_template(vec); + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float shuffle + template + RTM_FORCE_INLINE vector4f vector_shuffle_template(const vector4f& vec1, const vector4f& vec2) + { + static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index"); + return _mm_shuffle_ps(vec1, vec2, SHUFFLEMASK(index0, index1, index2, index3)); + } + + // Float Shuffle specializations + template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) { return _mm_movelh_ps(vec1, vec2); } + template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) { return _mm_movehl_ps(vec2, vec1); } // Note: movehl copies first from the 2nd argument + +}; // namespace sse2_permute + + +#define VECTOR_REPLICATE( vec, element_index ) sse2_permute::vector_replicate_template(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) sse2_permute::vector_swizzle_template( vec ) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) sse2_permute::vector_shuffle_template( vec1, vec2 ) + +#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__) +//now we only support __clang__ neon here + +template +RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) +{ + return __builtin_shufflevector(vec, vec, x, y, z, w); +} + +template +RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) +{ + return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); +} + +template +RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) +{ + return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); +} + +template +RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec) +{ + return vdupq_n_f64(vgetq_lane_f64(vec, element_index)); +} + +#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl(vec) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl(vec1, vec2) + +#else +#pragma error("vector swizzle not implement here!"); +#endif + +RTM_IMPL_VERSION_NAMESPACE_END +} + +RTM_IMPL_FILE_PRAGMA_POP + diff --git a/includes/rtm/matrix3x3f.h b/includes/rtm/matrix3x3f.h index 40a90f9c..0fe31a7e 100644 --- a/includes/rtm/matrix3x3f.h +++ b/includes/rtm/matrix3x3f.h @@ -181,7 +181,7 @@ namespace rtm // is to multiply the normal with the cofactor matrix. // See: https://github.com/graphitemaster/normals_revisited ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; diff --git a/includes/rtm/matrix3x4f.h b/includes/rtm/matrix3x4f.h index 71f7f1b0..e7fa5963 100644 --- a/includes/rtm/matrix3x4f.h +++ b/includes/rtm/matrix3x4f.h @@ -61,22 +61,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion and translation. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const vector4f x_axis = vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F); const vector4f y_axis = vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F); @@ -87,7 +90,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qv(transform.rotation, transform.translation); } @@ -95,23 +98,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and scalar scale. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; - + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_s = scalar_set(scale); const vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_s); @@ -123,7 +128,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVS transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvs(transform.rotation, transform.translation_scale, qvs_get_scale(transform)); } @@ -131,22 +136,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and 3D scale. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_x = vector_get_x_as_scalar(scale); const scalarf scale_y = vector_get_y_as_scalar(scale); @@ -161,7 +169,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvv(transform.rotation, transform.translation, transform.scale); } @@ -209,7 +217,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 3x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { diff --git a/includes/rtm/matrix4x4f.h b/includes/rtm/matrix4x4f.h index 29494ea9..6116f8b4 100644 --- a/includes/rtm/matrix4x4f.h +++ b/includes/rtm/matrix4x4f.h @@ -41,7 +41,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the forward direction of the default coordinate system (Z+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.z_axis; } @@ -49,7 +49,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the up direction of the default coordinate system (Y+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.y_axis; } @@ -57,7 +57,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the cross direction of the default coordinate system (X+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.x_axis; } @@ -65,7 +65,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis holding the position of the default coordinate system (W+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.w_axis; } @@ -73,7 +73,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_arg0 input, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_argn input, axis4 axis) RTM_NO_EXCEPT { return axis == axis4::x ? input.x_axis : (axis == axis4::y ? input.y_axis : (axis == axis4::z ? input.z_axis : input.w_axis)); } @@ -81,7 +81,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_argn input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { @@ -96,7 +96,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_argn input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -111,7 +111,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -126,7 +126,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -142,7 +142,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -159,7 +159,7 @@ namespace rtm // Multiplies two 4x4 matrices. // Multiplication order is as follow: local_to_world = matrix_mul(local_to_object, object_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_argn lhs, matrix4x4f_argn rhs) RTM_NO_EXCEPT { vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); @@ -192,7 +192,7 @@ namespace rtm // Multiplies a 4x4 matrix and a 4D vector. // Multiplication order is as follow: world_position = matrix_mul(local_position, local_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; @@ -207,7 +207,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Transposes a 4x4 matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_argn input) RTM_NO_EXCEPT { vector4f x_axis; vector4f y_axis; @@ -222,7 +222,7 @@ namespace rtm // If the input matrix is not invertible, the result is undefined. // For a safe alternative, supply a fallback value and a threshold. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -316,7 +316,7 @@ namespace rtm // If the input matrix has a determinant whose absolute value is below the supplied threshold, the // fall back value is returned instead. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input, matrix4x4f_arg1 fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input, matrix4x4f_argn fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -411,7 +411,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the determinant of the input 4x4 matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -458,7 +458,7 @@ namespace rtm // The minor is the determinant of the sub-matrix input when the specified // row and column are removed. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_arg0 input, axis4 row, axis4 column) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_argn input, axis4 row, axis4 column) RTM_NO_EXCEPT { vector4f row0; vector4f row1; @@ -522,7 +522,7 @@ namespace rtm // Returns the cofactor matrix of the input 4x4 matrix. // See: https://en.wikipedia.org/wiki/Minor_(linear_algebra)#Cofactor_expansion_of_the_determinant ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_argn input) RTM_NO_EXCEPT { const scalarf minor_xx = matrix_minor(input, axis4::x, axis4::x); const scalarf minor_xy = matrix_minor(input, axis4::x, axis4::y); @@ -558,7 +558,7 @@ namespace rtm // Returns the adjugate of the input matrix. // See: https://en.wikipedia.org/wiki/Adjugate_matrix ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT { return matrix_transpose(matrix_cofactor(input)); } diff --git a/includes/rtm/quatf.h b/includes/rtm/quatf.h index 1dcf25a7..751b2a3d 100644 --- a/includes/rtm/quatf.h +++ b/includes/rtm/quatf.h @@ -733,19 +733,7 @@ namespace rtm { RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE RTM_SIMD_CALL operator float() const RTM_NO_EXCEPT { -#if defined(RTM_SSE4_INTRINSICS) && 0 - // SSE4 dot product instruction appears slower on Zen2, is it the case elsewhere as well? - return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xFF)); -#elif defined(RTM_SSE2_INTRINSICS) - __m128 x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs); - __m128 z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0); - __m128 y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, _MM_SHUFFLE(0, 0, 0, 1)); - __m128 x2y2z2w2_0_0_0 = _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0); - return _mm_cvtss_f32(x2y2z2w2_0_0_0); -#else - return (quat_get_x(lhs) * quat_get_x(rhs)) + (quat_get_y(lhs) * quat_get_y(rhs)) + (quat_get_z(lhs) * quat_get_z(rhs)) + (quat_get_w(lhs) * quat_get_w(rhs)); -#endif + return vector_dot((vector4f&)lhs, (vector4f&)rhs); } #if defined(RTM_SSE2_INTRINSICS) diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index 7cda1544..de99b78f 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -34,6 +34,7 @@ #include "rtm/impl/macros.mask4.impl.h" #include "rtm/impl/memory_utils.h" #include "rtm/impl/vector_common.h" +#include "rtm/impl/vector_swizzle.h" #include #include @@ -63,6 +64,20 @@ namespace rtm #endif } + ////////////////////////////////////////////////////////////////////////// + // Loads an aligned vector4 from memory. + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_load_aligned(const float* input) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + return _mm_load_ps(input); +#elif defined(RTM_NEON_INTRINSICS) + return vld1q_f32(input); +#else + return vector_set(input[0], input[1], input[2], input[3]); +#endif + } + ////////////////////////////////////////////////////////////////////////// // Loads an input scalar from memory into the [x] component and sets the [yzw] components to zero. ////////////////////////////////////////////////////////////////////////// @@ -1005,6 +1020,25 @@ namespace rtm { #if defined(RTM_SSE2_INTRINSICS) _mm_storeu_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); + output[3] = vector_get_w(input); +#endif + } + + ////////////////////////////////////////////////////////////////////////// + // Writes a vector4 to aligned memory. + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store_aligned(vector4f_arg0 input, float* output) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + _mm_store_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); #else output[0] = vector_get_x(input); output[1] = vector_get_y(input); @@ -1026,8 +1060,15 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store2(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) output[0] = vector_get_x(input); output[1] = vector_get_y(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); +#endif } ////////////////////////////////////////////////////////////////////////// @@ -1035,11 +1076,22 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store3(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); + vst1q_lane_f32(((float32_t*)output) + 2, input, 2); +#else output[0] = vector_get_x(input); output[1] = vector_get_y(input); output[2] = vector_get_z(input); +#endif } + + ////////////////////////////////////////////////////////////////////////// // Writes a vector4 to unaligned memory. ////////////////////////////////////////////////////////////////////////// @@ -1510,47 +1562,6 @@ namespace rtm #endif } - ////////////////////////////////////////////////////////////////////////// - // 3D cross product: lhs x rhs - ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT - { -#if defined(RTM_SSE2_INTRINSICS) - // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) - __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); - - // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx - return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); -#elif defined(RTM_NEON_INTRINSICS) - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - float32x4_t lhs_yzwx = vextq_f32(lhs, lhs, 1); - float32x4_t rhs_wxyz = vextq_f32(rhs, rhs, 3); - - float32x4_t lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2); - float32x4_t rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0); - - // part_a = (a.yzx * b.zxy) - float32x4_t part_a = vmulq_f32(lhs_yzx, rhs_zxy); - - float32x4_t lhs_wxyz = vextq_f32(lhs, lhs, 3); - float32x4_t rhs_yzwx = vextq_f32(rhs, rhs, 1); - float32x4_t lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0); - float32x4_t rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2); - - return vmlsq_f32(part_a, lhs_zxy, rhs_yzx); -#else - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - const float lhs_x = vector_get_x(lhs); - const float lhs_y = vector_get_y(lhs); - const float lhs_z = vector_get_z(lhs); - const float rhs_x = vector_get_x(rhs); - const float rhs_y = vector_get_y(rhs); - const float rhs_z = vector_get_z(rhs); - return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); -#endif - } namespace rtm_impl { @@ -2424,6 +2435,48 @@ namespace rtm } #endif + //v2 - v0 * v1 + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_neg_mul_add(vector4f_arg0 v0, vector4f_arg1 v1, vector4f_arg2 v2) RTM_NO_EXCEPT + { + return vector_neg_mul_sub(v0, v1, v2); + } + + + ////////////////////////////////////////////////////////////////////////// + // 3D cross product: lhs x rhs + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) + __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); + + // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx + return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); +#elif defined(RTM_NEON_INTRINSICS) + // YZX + vector4f A = VECTOR_SWIZZLE(rhs, 1, 2, 0, 3); + vector4f B = VECTOR_SWIZZLE(lhs, 1, 2, 0, 3); + // XY, YZ, ZX + A = vector_mul(A, lhs); + // XY-YX, YZ-ZY, ZX-XZ + A = vector_neg_mul_add(B, rhs, A); + // YZ-ZY, ZX-XZ, XY-YX + return VECTOR_SWIZZLE(A, 1, 2, 0, 3); +#else + // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) + const float lhs_x = vector_get_x(lhs); + const float lhs_y = vector_get_y(lhs); + const float lhs_z = vector_get_z(lhs); + const float rhs_x = vector_get_x(rhs); + const float rhs_y = vector_get_y(rhs); + const float rhs_z = vector_get_z(rhs); + return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); +#endif + } + ////////////////////////////////////////////////////////////////////////// // Per component linear interpolation of the two inputs at the specified alpha. // The formula used is: ((1.0 - alpha) * start) + (alpha * end). @@ -3373,244 +3426,60 @@ namespace rtm template RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT { -#if defined(RTM_SSE4_INTRINSICS) - // Each component comes from the respective position of input 0 or input 1 - if (rtm_impl::static_condition<(comp0 == mix4::a || comp0 == mix4::x) && (comp1 == mix4::b || comp1 == mix4::y) && - (comp2 == mix4::c || comp2 == mix4::z) && (comp3 == mix4::d || comp3 == mix4::w)>::test()) - { - constexpr int mask = (comp0 == mix4::a ? 1 : 0) | (comp1 == mix4::b ? 2 : 0) | - (comp2 == mix4::c ? 4 : 0) | (comp3 == mix4::d ? 8 : 0); - return _mm_blend_ps(input0, input1, mask); - } - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, (int(comp0) % 4) << 6); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp3) % 4) << 6) | (3 << 4)); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, (int(comp0) % 4) << 6); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp3) % 4) << 6) | (3 << 4)); -#endif // defined(RTM_SSE4_INTRINSICS) - -#if defined(RTM_SSE2_INTRINSICS) - // All four components come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // All four components come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // First two components come from input 0, second two come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // First two components come from input 1, second two come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input1, input0); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input1, input0); -#endif // defined(RTM_SSE2_INTRINSICS) - -#if defined(RTM_NEON64_INTRINSICS) - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input1, input0); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input1, input0); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input0, input1); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input1, input0); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input0, input1); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input1, input0); - - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input0, input1); - - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input1, input0); - - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input0, input1); - - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input1, input0); -#endif // defined(RTM_NEON64_INTRINSICS) - -#if defined(RTM_NEON_INTRINSICS) - // The highest vector elements from input 0 and the lowest vector elements from input 1, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input0, input1, int(comp0) % 4); - - // The highest vector elements from input 1 and the lowest vector elements from input 0, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input1, input0, int(comp0) % 4); - - // All four components come from input 0, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input0); - - // All four components come from input 1, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input1); - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp0) % 4), input0, 0); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp1) % 4), input0, 1); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp2) % 4), input0, 2); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp3) % 4), input0, 3); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp0) % 4), input1, 0); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp1) % 4), input1, 1); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp2) % 4), input1, 2); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp3) % 4), input1, 3); - - - // All comes from the same position - if (rtm_impl::static_condition::test()) { - // All comes from the same position of input0 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input0, int(comp0) % 4)); - // All comes from the same position of input1 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input1, int(comp0) % 4)); - } -#endif // defined(RTM_NEON_INTRINSICS) - - // Slow code path, not yet optimized or not using intrinsics - constexpr component4 component0 = rtm_impl::mix_to_component(comp0); - constexpr component4 component1 = rtm_impl::mix_to_component(comp1); - constexpr component4 component2 = rtm_impl::mix_to_component(comp2); - constexpr component4 component3 = rtm_impl::mix_to_component(comp3); - - const float x0 = vector_get_component(input0, component0); - const float x1 = vector_get_component(input1, component0); - const float x = rtm_impl::is_mix_xyzw(comp0) ? x0 : x1; - - const float y0 = vector_get_component(input0, component1); - const float y1 = vector_get_component(input1, component1); - const float y = rtm_impl::is_mix_xyzw(comp1) ? y0 : y1; - - const float z0 = vector_get_component(input0, component2); - const float z1 = vector_get_component(input1, component2); - const float z = rtm_impl::is_mix_xyzw(comp2) ? z0 : z1; - - const float w0 = vector_get_component(input0, component3); - const float w1 = vector_get_component(input1, component3); - const float w = rtm_impl::is_mix_xyzw(comp3) ? w0 : w1; - - return vector_set(x, y, z, w); + constexpr int index0 = (int)comp0; + constexpr int index1 = (int)comp1; + constexpr int index2 = (int)comp2; + constexpr int index3 = (int)comp3; +#if defined(__clang__) + return __builtin_shufflevector(input0, input1, index0, index1, index2, index3); +#else + if constexpr (index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) { + return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4); + } + else if constexpr(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) { + //no input1 use here + return VECTOR_SWIZZLE(input0, index0, index1, index2, index3); + } + else if constexpr(index0 >=4 && index1 >=4 && index2 >=4 && index3 >=4) { + //no input0 use here + return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 -4); + }else { + + float combine_arr[8]; + vector_store(input0, combine_arr); + vector_store(input1, combine_arr + 4); + return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]); + } +#endif } ////////////////////////////////////////////////////////////////////////// // Replicates the [x] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { + return VECTOR_REPLICATE(input, 0); + } ////////////////////////////////////////////////////////////////////////// // Replicates the [y] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { + return VECTOR_REPLICATE(input, 1); + } ////////////////////////////////////////////////////////////////////////// // Replicates the [z] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { + return VECTOR_REPLICATE(input, 2); + } ////////////////////////////////////////////////////////////////////////// // Replicates the [w] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { + return VECTOR_REPLICATE(input, 3); + } ////////////////////////////////////////////////////////////////////////// // Logical