Optimized the performance of float object

nfrechette · Jun 24, 2024 · 8554eac · 8554eac
1 parent e1a80f5
commit 8554eac
Show file tree

Hide file tree

Showing 7 changed files with 355 additions and 364 deletions.
diff --git a/includes/rtm/impl/matrix_affine_common.h b/includes/rtm/impl/matrix_affine_common.h
@@ -58,18 +58,21 @@ namespace rtm
 			{
 				RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");
 
-				const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
-				const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
-				const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
-				const float_type xx = quat_get_x(quat_input) * x2;
-				const float_type xy = quat_get_x(quat_input) * y2;
-				const float_type xz = quat_get_x(quat_input) * z2;
-				const float_type yy = quat_get_y(quat_input) * y2;
-				const float_type yz = quat_get_y(quat_input) * z2;
-				const float_type zz = quat_get_z(quat_input) * z2;
-				const float_type wx = quat_get_w(quat_input) * x2;
-				const float_type wy = quat_get_w(quat_input) * y2;
-				const float_type wz = quat_get_w(quat_input) * z2;
+				float_type quatval[4];
+				quat_store(quat_input, quatval);
+
+				const float_type x2 = quatval[0] + quatval[0];
+				const float_type y2 = quatval[1] + quatval[1];
+				const float_type z2 = quatval[2] + quatval[2];
+				const float_type xx = quatval[0] * x2;
+				const float_type xy = quatval[0] * y2;
+				const float_type xz = quatval[0] * z2;
+				const float_type yy = quatval[1] * y2;
+				const float_type yz = quatval[1] * z2;
+				const float_type zz = quatval[2] * z2;
+				const float_type wx = quatval[3] * x2;
+				const float_type wy = quatval[3] * y2;
+				const float_type wz = quatval[3] * z2;
 
 				const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
 				const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
@@ -80,19 +83,21 @@ namespace rtm
 			RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT
 			{
 				RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");
-
-				const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
-				const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
-				const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
-				const float_type xx = quat_get_x(quat_input) * x2;
-				const float_type xy = quat_get_x(quat_input) * y2;
-				const float_type xz = quat_get_x(quat_input) * z2;
-				const float_type yy = quat_get_y(quat_input) * y2;
-				const float_type yz = quat_get_y(quat_input) * z2;
-				const float_type zz = quat_get_z(quat_input) * z2;
-				const float_type wx = quat_get_w(quat_input) * x2;
-				const float_type wy = quat_get_w(quat_input) * y2;
-				const float_type wz = quat_get_w(quat_input) * z2;
+				float_type quatval[4];
+				quat_store(quat_input, quatval);
+
+				const float_type x2 = quatval[0] + quatval[0];
+				const float_type y2 = quatval[1] + quatval[1];
+				const float_type z2 = quatval[2] + quatval[2];
+				const float_type xx = quatval[0] * x2;
+				const float_type xy = quatval[0] * y2;
+				const float_type xz = quatval[0] * z2;
+				const float_type yy = quatval[1] * y2;
+				const float_type yz = quatval[1] * z2;
+				const float_type zz = quatval[2] * z2;
+				const float_type wx = quatval[3] * x2;
+				const float_type wy = quatval[3] * y2;
+				const float_type wz = quatval[3] * z2;
 
 				const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
 				const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));

diff --git a/includes/rtm/impl/vector_swizzle.h b/includes/rtm/impl/vector_swizzle.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+// #include <spatial/core/Platform.hpp>
+#include "rtm/math.h"
+#include "rtm/types.h"
+#include "rtm/impl/compiler_utils.h"
+#include "rtm/scalarf.h"
+#include "rtm/scalard.h"
+#include "rtm/version.h"
+
+
+RTM_IMPL_FILE_PRAGMA_PUSH
+
+namespace rtm
+{
+	RTM_IMPL_VERSION_NAMESPACE_BEGIN
+
+#if defined(RTM_SSE2_INTRINSICS)
+
+namespace sse2_permute
+{
+#define SHUFFLEMASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float swizzle
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_swizzle_template(const vector4f& vec)
+	{
+		return _mm_shuffle_ps(vec, vec, SHUFFLEMASK(index0, index1, index2, index3));
+	}
+
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 2, 3>(const vector4f& vec) { return vec; }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 0, 1>(const vector4f& vec) { return _mm_movelh_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 3, 2, 3>(const vector4f& vec) { return _mm_movehl_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 1, 1>(const vector4f& vec) { return _mm_unpacklo_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 2, 3, 3>(const vector4f& vec) { return _mm_unpackhi_ps(vec, vec); }
+
+#if defined(RTM_SSE4_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 2, 2>(const vector4f& vec) { return _mm_moveldup_ps(vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<1, 1, 3, 3>(const vector4f& vec) { return _mm_movehdup_ps(vec); }
+#endif
+
+#if defined(RTM_AVX2_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 0, 0>(const vector4f& vec) { return _mm_broadcastss_ps(vec); }
+#endif
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float replicate
+	template<int Index>
+	RTM_FORCE_INLINE vector4f vector_replicate_template(const vector4f& vec)
+	{
+		static_assert(Index >= 0 && Index <= 3, "Invalid Index");
+		return vector_swizzle_template<Index, Index, Index, Index>(vec);
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float shuffle
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_shuffle_template(const vector4f& vec1, const vector4f& vec2)
+	{
+		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
+		return _mm_shuffle_ps(vec1, vec2, SHUFFLEMASK(index0, index1, index2, index3));
+	}
+
+	// Float Shuffle specializations
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) { return _mm_movelh_ps(vec1, vec2); }
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) { return _mm_movehl_ps(vec2, vec1); } // Note: movehl copies first from the 2nd argument
+
+}; // namespace sse2_permute
+
+
+#define VECTOR_REPLICATE( vec, element_index )	sse2_permute::vector_replicate_template<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w )		sse2_permute::vector_swizzle_template<x,y,z,w>( vec )
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )		sse2_permute::vector_shuffle_template<x,y,z,w>( vec1, vec2 )
+
+#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__)
+//now we only support __clang__ neon here
+
+template <int x, int y, int z, int w>
+RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
+{
+	return __builtin_shufflevector(vec, vec, x, y, z, w);
+}
+
+template <int x, int y, int z, int w>
+RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
+{
+	return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4);
+}
+
+template <int element_index>
+RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
+{
+	return vdupq_n_f32(vgetq_lane_f32(vec, element_index));
+}
+
+template <int element_index>
+RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec)
+{
+	return vdupq_n_f64(vgetq_lane_f64(vec, element_index));
+}
+
+#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>(vec)
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )	vector_shuffle_impl<x, y, z, w>(vec1, vec2)
+
+#else
+#pragma error("vector swizzle not implement here!");
+#endif
+
+RTM_IMPL_VERSION_NAMESPACE_END
+}
+
+RTM_IMPL_FILE_PRAGMA_POP
+
diff --git a/includes/rtm/matrix3x3f.h b/includes/rtm/matrix3x3f.h
@@ -181,7 +181,7 @@ namespace rtm
 	// is to multiply the normal with the cofactor matrix.
 	// See: https://github.com/graphitemaster/normals_revisited
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT
+    RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT
 	{
 		vector4f tmp;
 

diff --git a/includes/rtm/matrix3x4f.h b/includes/rtm/matrix3x4f.h
@@ -61,22 +61,25 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion and translation.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 
 		const vector4f x_axis = vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F);
 		const vector4f y_axis = vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F);
@@ -87,31 +90,33 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QV transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qv(transform.rotation, transform.translation);
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion, translation, and scalar scale.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
-
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 		const scalarf scale_s = scalar_set(scale);
 
 		const vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_s);
@@ -123,30 +128,33 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QVS transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qvs(transform.rotation, transform.translation_scale, qvs_get_scale(transform));
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion, translation, and 3D scale.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 
 		const scalarf scale_x = vector_get_x_as_scalar(scale);
 		const scalarf scale_y = vector_get_y_as_scalar(scale);
@@ -161,7 +169,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QVV transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qvv(transform.rotation, transform.translation, transform.scale);
 	}
@@ -209,7 +217,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns a new 3x4 matrix where the specified axis has been replaced on the input matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{