From 8554eacb3d4325ddaec8f3c4479838b13271236c Mon Sep 17 00:00:00 2001
From: zhouqili <zhouqili@bytedance.com>
Date: Fri, 21 Jun 2024 12:22:41 +0800
Subject: [PATCH] Optimized the performance of float object

---
 includes/rtm/impl/matrix_affine_common.h |  55 ++--
 includes/rtm/impl/vector_swizzle.h       | 121 +++++++
 includes/rtm/matrix3x3f.h                |   2 +-
 includes/rtm/matrix3x4f.h                |  96 +++---
 includes/rtm/matrix4x4f.h                |  38 +--
 includes/rtm/quatf.h                     |  14 +-
 includes/rtm/vector4f.h                  | 393 ++++++++---------------
 7 files changed, 355 insertions(+), 364 deletions(-)
 create mode 100644 includes/rtm/impl/vector_swizzle.h

diff --git a/includes/rtm/impl/matrix_affine_common.h b/includes/rtm/impl/matrix_affine_common.h
index 5bef3a43..df914e5c 100644
--- a/includes/rtm/impl/matrix_affine_common.h
+++ b/includes/rtm/impl/matrix_affine_common.h
@@ -58,18 +58,21 @@ namespace rtm
 			{
 				RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");
 
-				const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
-				const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
-				const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
-				const float_type xx = quat_get_x(quat_input) * x2;
-				const float_type xy = quat_get_x(quat_input) * y2;
-				const float_type xz = quat_get_x(quat_input) * z2;
-				const float_type yy = quat_get_y(quat_input) * y2;
-				const float_type yz = quat_get_y(quat_input) * z2;
-				const float_type zz = quat_get_z(quat_input) * z2;
-				const float_type wx = quat_get_w(quat_input) * x2;
-				const float_type wy = quat_get_w(quat_input) * y2;
-				const float_type wz = quat_get_w(quat_input) * z2;
+				float_type quatval[4];
+				quat_store(quat_input, quatval);
+
+				const float_type x2 = quatval[0] + quatval[0];
+				const float_type y2 = quatval[1] + quatval[1];
+				const float_type z2 = quatval[2] + quatval[2];
+				const float_type xx = quatval[0] * x2;
+				const float_type xy = quatval[0] * y2;
+				const float_type xz = quatval[0] * z2;
+				const float_type yy = quatval[1] * y2;
+				const float_type yz = quatval[1] * z2;
+				const float_type zz = quatval[2] * z2;
+				const float_type wx = quatval[3] * x2;
+				const float_type wy = quatval[3] * y2;
+				const float_type wz = quatval[3] * z2;
 
 				const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
 				const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
@@ -80,19 +83,21 @@ namespace rtm
 			RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT
 			{
 				RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");
-
-				const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
-				const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
-				const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
-				const float_type xx = quat_get_x(quat_input) * x2;
-				const float_type xy = quat_get_x(quat_input) * y2;
-				const float_type xz = quat_get_x(quat_input) * z2;
-				const float_type yy = quat_get_y(quat_input) * y2;
-				const float_type yz = quat_get_y(quat_input) * z2;
-				const float_type zz = quat_get_z(quat_input) * z2;
-				const float_type wx = quat_get_w(quat_input) * x2;
-				const float_type wy = quat_get_w(quat_input) * y2;
-				const float_type wz = quat_get_w(quat_input) * z2;
+				float_type quatval[4];
+				quat_store(quat_input, quatval);
+				
+				const float_type x2 = quatval[0] + quatval[0];
+				const float_type y2 = quatval[1] + quatval[1];
+				const float_type z2 = quatval[2] + quatval[2];
+				const float_type xx = quatval[0] * x2;
+				const float_type xy = quatval[0] * y2;
+				const float_type xz = quatval[0] * z2;
+				const float_type yy = quatval[1] * y2;
+				const float_type yz = quatval[1] * z2;
+				const float_type zz = quatval[2] * z2;
+				const float_type wx = quatval[3] * x2;
+				const float_type wy = quatval[3] * y2;
+				const float_type wz = quatval[3] * z2;
 
 				const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
 				const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
diff --git a/includes/rtm/impl/vector_swizzle.h b/includes/rtm/impl/vector_swizzle.h
new file mode 100644
index 00000000..f837108d
--- /dev/null
+++ b/includes/rtm/impl/vector_swizzle.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+// #include <spatial/core/Platform.hpp>
+#include "rtm/math.h"
+#include "rtm/types.h"
+#include "rtm/impl/compiler_utils.h"
+#include "rtm/scalarf.h"
+#include "rtm/scalard.h"
+#include "rtm/version.h"
+
+
+RTM_IMPL_FILE_PRAGMA_PUSH
+
+namespace rtm
+{
+	RTM_IMPL_VERSION_NAMESPACE_BEGIN
+
+#if defined(RTM_SSE2_INTRINSICS)
+
+namespace sse2_permute
+{
+#define SHUFFLEMASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float swizzle: returns { vec[index0], vec[index1], vec[index2], vec[index3] }, every index must be in [0, 3].
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_swizzle_template(const vector4f& vec)
+	{
+		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
+		return _mm_shuffle_ps(vec, vec, SHUFFLEMASK(index0, index1, index2, index3));
+	}
+
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 2, 3>(const vector4f& vec) { return vec; }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 1, 0, 1>(const vector4f& vec) { return _mm_movelh_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 3, 2, 3>(const vector4f& vec) { return _mm_movehl_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 1, 1>(const vector4f& vec) { return _mm_unpacklo_ps(vec, vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<2, 2, 3, 3>(const vector4f& vec) { return _mm_unpackhi_ps(vec, vec); }
+
+#if defined(RTM_SSE4_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 2, 2>(const vector4f& vec) { return _mm_moveldup_ps(vec); }
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<1, 1, 3, 3>(const vector4f& vec) { return _mm_movehdup_ps(vec); }
+#endif
+
+#if defined(RTM_AVX2_INTRINSICS)
+	template<> RTM_FORCE_INLINE vector4f vector_swizzle_template<0, 0, 0, 0>(const vector4f& vec) { return _mm_broadcastss_ps(vec); }
+#endif
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float replicate: broadcasts lane Index of vec into all four lanes.
+	template<int Index>
+	RTM_FORCE_INLINE vector4f vector_replicate_template(const vector4f& vec)
+	{
+		static_assert(Index >= 0 && Index <= 3, "Invalid Index");
+		return vector_swizzle_template<Index, Index, Index, Index>(vec);
+	}
+
+	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// Float shuffle: returns { vec1[index0], vec1[index1], vec2[index2], vec2[index3] }.
+	template<int index0, int index1, int index2, int index3>
+	RTM_FORCE_INLINE vector4f vector_shuffle_template(const vector4f& vec1, const vector4f& vec2)
+	{
+		static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
+		return _mm_shuffle_ps(vec1, vec2, SHUFFLEMASK(index0, index1, index2, index3));
+	}
+
+	// Float Shuffle specializations
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) { return _mm_movelh_ps(vec1, vec2); }
+	template<> RTM_FORCE_INLINE vector4f vector_shuffle_template<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) { return _mm_movehl_ps(vec2, vec1); } // Note: movehl copies first from the 2nd argument
+
+} // namespace sse2_permute
+
+
+#define VECTOR_REPLICATE( vec, element_index )	sse2_permute::vector_replicate_template<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w )		sse2_permute::vector_swizzle_template<x,y,z,w>( vec )
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )		sse2_permute::vector_shuffle_template<x,y,z,w>( vec1, vec2 )
+
+#elif defined(RTM_NEON_INTRINSICS) && defined(__clang__)
+// Only clang is supported for NEON for now: the swizzle and shuffle helpers
+// below are built on its __builtin_shufflevector.
+
+// Swizzle: returns { vec[x], vec[y], vec[z], vec[w] }.
+template <int x, int y, int z, int w>
+RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) { return __builtin_shufflevector(vec, vec, x, y, z, w); }
+
+// Shuffle: returns { vec1[x], vec1[y], vec2[z], vec2[w] }.
+// __builtin_shufflevector indexes the concatenation of both inputs,
+// hence the +4 to address the lanes of vec2.
+template <int x, int y, int z, int w>
+RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) { return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); }
+
+// Replicate: broadcasts lane element_index of vec into all four lanes.
+template <int element_index>
+RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) { return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); }
+
+// Double precision replicate: broadcasts lane element_index into both lanes.
+// float64x2_t and the f64 lane intrinsics only exist on AArch64; without this guard
+// the header fails to compile for 32-bit ARM NEON targets.
+#if defined(__aarch64__)
+template <int element_index>
+RTM_FORCE_INLINE float64x2_t vector_replicate_impl(const float64x2_t& vec) { return vdupq_n_f64(vgetq_lane_f64(vec, element_index)); }
+#endif
+
+// Same front-end macros as the SSE2 path.
+#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
+#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>(vec)
+#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w )	vector_shuffle_impl<x, y, z, w>(vec1, vec2)
+
+#else
+#error "vector swizzle is not implemented for this platform/compiler"	// '#pragma error' is an unknown pragma and gets ignored
+#endif
+
+RTM_IMPL_VERSION_NAMESPACE_END
+}
+
+RTM_IMPL_FILE_PRAGMA_POP
+
diff --git a/includes/rtm/matrix3x3f.h b/includes/rtm/matrix3x3f.h
index 40a90f9c..0fe31a7e 100644
--- a/includes/rtm/matrix3x3f.h
+++ b/includes/rtm/matrix3x3f.h
@@ -181,7 +181,7 @@ namespace rtm
 	// is to multiply the normal with the cofactor matrix.
 	// See: https://github.com/graphitemaster/normals_revisited
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT
+    RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT
 	{
 		vector4f tmp;
 
diff --git a/includes/rtm/matrix3x4f.h b/includes/rtm/matrix3x4f.h
index 71f7f1b0..e7fa5963 100644
--- a/includes/rtm/matrix3x4f.h
+++ b/includes/rtm/matrix3x4f.h
@@ -61,22 +61,25 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion and translation.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 
 		const vector4f x_axis = vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F);
 		const vector4f y_axis = vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F);
@@ -87,7 +90,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QV transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qv(transform.rotation, transform.translation);
 	}
@@ -95,23 +98,25 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion, translation, and scalar scale.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
-
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 		const scalarf scale_s = scalar_set(scale);
 
 		const vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_s);
@@ -123,7 +128,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QVS transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qvs(transform.rotation, transform.translation_scale, qvs_get_scale(transform));
 	}
@@ -131,22 +136,25 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Sets a 3x4 affine matrix from a rotation quaternion, translation, and 3D scale.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT
 	{
 		RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized");
 
-		const float x2 = quat_get_x(quat) + quat_get_x(quat);
-		const float y2 = quat_get_y(quat) + quat_get_y(quat);
-		const float z2 = quat_get_z(quat) + quat_get_z(quat);
-		const float xx = quat_get_x(quat) * x2;
-		const float xy = quat_get_x(quat) * y2;
-		const float xz = quat_get_x(quat) * z2;
-		const float yy = quat_get_y(quat) * y2;
-		const float yz = quat_get_y(quat) * z2;
-		const float zz = quat_get_z(quat) * z2;
-		const float wx = quat_get_w(quat) * x2;
-		const float wy = quat_get_w(quat) * y2;
-		const float wz = quat_get_w(quat) * z2;
+		float quatval[4];
+		quat_store(quat, quatval);
+
+		const float x2 = quatval[0] + quatval[0];
+		const float y2 = quatval[1] + quatval[1];
+		const float z2 = quatval[2] + quatval[2];
+		const float xx = quatval[0] * x2;
+		const float xy = quatval[0] * y2;
+		const float xz = quatval[0] * z2;
+		const float yy = quatval[1] * y2;
+		const float yz = quatval[1] * z2;
+		const float zz = quatval[2] * z2;
+		const float wx = quatval[3] * x2;
+		const float wy = quatval[3] * y2;
+		const float wz = quatval[3] * z2;
 
 		const scalarf scale_x = vector_get_x_as_scalar(scale);
 		const scalarf scale_y = vector_get_y_as_scalar(scale);
@@ -161,7 +169,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Converts a QVV transform into a 3x4 affine matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_arg0 transform) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_argn transform) RTM_NO_EXCEPT
 	{
 		return matrix_from_qvv(transform.rotation, transform.translation, transform.scale);
 	}
@@ -209,7 +217,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns a new 3x4 matrix where the specified axis has been replaced on the input matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
diff --git a/includes/rtm/matrix4x4f.h b/includes/rtm/matrix4x4f.h
index 29494ea9..6116f8b4 100644
--- a/includes/rtm/matrix4x4f.h
+++ b/includes/rtm/matrix4x4f.h
@@ -41,7 +41,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the axis pointing in the forward direction of the default coordinate system (Z+).
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		return input.z_axis;
 	}
@@ -49,7 +49,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the axis pointing in the up direction of the default coordinate system (Y+).
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		return input.y_axis;
 	}
@@ -57,7 +57,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the axis pointing in the cross direction of the default coordinate system (X+).
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		return input.x_axis;
 	}
@@ -65,7 +65,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the axis holding the position of the default coordinate system (W+).
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		return input.w_axis;
 	}
@@ -73,7 +73,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the desired 4x4 matrix axis.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_arg0 input, axis4 axis) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_argn input, axis4 axis) RTM_NO_EXCEPT
 	{
 		return axis == axis4::x ? input.x_axis : (axis == axis4::y ? input.y_axis : (axis == axis4::z ? input.z_axis : input.w_axis));
 	}
@@ -81,7 +81,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns a new 4x4 matrix where the specified axis has been replaced on the input matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_argn input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
@@ -96,7 +96,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the desired 4x4 matrix component from the specified axis.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_argn input, axis4 axis, component4 component) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
@@ -111,7 +111,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the desired 4x4 matrix component from the specified axis.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
@@ -126,7 +126,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
@@ -142,7 +142,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT
 	{
 		switch (axis)
 		{
@@ -159,7 +159,7 @@ namespace rtm
 	// Multiplies two 4x4 matrices.
 	// Multiplication order is as follow: local_to_world = matrix_mul(local_to_object, object_to_world)
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_argn lhs, matrix4x4f_argn rhs) RTM_NO_EXCEPT
 	{
 		vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis);
 		tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp);
@@ -192,7 +192,7 @@ namespace rtm
 	// Multiplies a 4x4 matrix and a 4D vector.
 	// Multiplication order is as follow: world_position = matrix_mul(local_position, local_to_world)
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_arg0 mtx) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_argn mtx) RTM_NO_EXCEPT
 	{
 		vector4f tmp;
 
@@ -207,7 +207,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Transposes a 4x4 matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		vector4f x_axis;
 		vector4f y_axis;
@@ -222,7 +222,7 @@ namespace rtm
 	// If the input matrix is not invertible, the result is undefined.
 	// For a safe alternative, supply a fallback value and a threshold.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		matrix4x4f input_transposed = matrix_transpose(input);
 
@@ -316,7 +316,7 @@ namespace rtm
 	// If the input matrix has a determinant whose absolute value is below the supplied threshold, the
 	// fall back value is returned instead.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input, matrix4x4f_arg1 fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input, matrix4x4f_argn fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT
 	{
 		matrix4x4f input_transposed = matrix_transpose(input);
 
@@ -411,7 +411,7 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the determinant of the input 4x4 matrix.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		matrix4x4f input_transposed = matrix_transpose(input);
 
@@ -458,7 +458,7 @@ namespace rtm
 	// The minor is the determinant of the sub-matrix input when the specified
 	// row and column are removed.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_arg0 input, axis4 row, axis4 column) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_argn input, axis4 row, axis4 column) RTM_NO_EXCEPT
 	{
 		vector4f row0;
 		vector4f row1;
@@ -522,7 +522,7 @@ namespace rtm
 	// Returns the cofactor matrix of the input 4x4 matrix.
 	// See: https://en.wikipedia.org/wiki/Minor_(linear_algebra)#Cofactor_expansion_of_the_determinant
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_argn input) RTM_NO_EXCEPT
 	{
 		const scalarf minor_xx = matrix_minor(input, axis4::x, axis4::x);
 		const scalarf minor_xy = matrix_minor(input, axis4::x, axis4::y);
@@ -558,7 +558,7 @@ namespace rtm
 	// Returns the adjugate of the input matrix.
 	// See: https://en.wikipedia.org/wiki/Adjugate_matrix
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT
 	{
 		return matrix_transpose(matrix_cofactor(input));
 	}
diff --git a/includes/rtm/quatf.h b/includes/rtm/quatf.h
index 1dcf25a7..751b2a3d 100644
--- a/includes/rtm/quatf.h
+++ b/includes/rtm/quatf.h
@@ -733,19 +733,7 @@ namespace rtm
 		{
 			RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE RTM_SIMD_CALL operator float() const RTM_NO_EXCEPT
 			{
-#if defined(RTM_SSE4_INTRINSICS) && 0
-				// SSE4 dot product instruction appears slower on Zen2, is it the case elsewhere as well?
-				return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xFF));
-#elif defined(RTM_SSE2_INTRINSICS)
-				__m128 x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs);
-				__m128 z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, _MM_SHUFFLE(0, 0, 3, 2));
-				__m128 x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0);
-				__m128 y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, _MM_SHUFFLE(0, 0, 0, 1));
-				__m128 x2y2z2w2_0_0_0 = _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0);
-				return _mm_cvtss_f32(x2y2z2w2_0_0_0);
-#else
-				return (quat_get_x(lhs) * quat_get_x(rhs)) + (quat_get_y(lhs) * quat_get_y(rhs)) + (quat_get_z(lhs) * quat_get_z(rhs)) + (quat_get_w(lhs) * quat_get_w(rhs));
-#endif
+				return vector_dot(quat_to_vector(lhs), quat_to_vector(rhs));	// reuse the vector4f dot product on the quaternion's four components
 			}
 
 #if defined(RTM_SSE2_INTRINSICS)
diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h
index 7cda1544..de99b78f 100644
--- a/includes/rtm/vector4f.h
+++ b/includes/rtm/vector4f.h
@@ -34,6 +34,7 @@
 #include "rtm/impl/macros.mask4.impl.h"
 #include "rtm/impl/memory_utils.h"
 #include "rtm/impl/vector_common.h"
+#include "rtm/impl/vector_swizzle.h"
 
 #include <cstring>
 #include <limits>
@@ -63,6 +64,20 @@ namespace rtm
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Loads a vector4 from aligned memory. The SSE2 path uses _mm_load_ps, which requires a 16-byte aligned address.
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_load_aligned(const float* input) RTM_NO_EXCEPT
+	{
+#if defined(RTM_SSE2_INTRINSICS)
+		return _mm_load_ps(input);
+#elif defined(RTM_NEON_INTRINSICS)
+		return vld1q_f32(input);
+#else
+		return vector_set(input[0], input[1], input[2], input[3]);
+#endif
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Loads an input scalar from memory into the [x] component and sets the [yzw] components to zero.
 	//////////////////////////////////////////////////////////////////////////
@@ -1005,6 +1020,25 @@ namespace rtm
 	{
 #if defined(RTM_SSE2_INTRINSICS)
 		_mm_storeu_ps(output, input);
+#elif defined(RTM_NEON_INTRINSICS)
+		vst1q_f32(output, input);
+#else
+		output[0] = vector_get_x(input);
+		output[1] = vector_get_y(input);
+		output[2] = vector_get_z(input);
+		output[3] = vector_get_w(input);
+#endif
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	// Writes a vector4 to aligned memory. The SSE2 path uses _mm_store_ps, which requires a 16-byte aligned address.
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store_aligned(vector4f_arg0 input, float* output) RTM_NO_EXCEPT
+	{
+#if defined(RTM_SSE2_INTRINSICS)
+		_mm_store_ps(output, input);
+#elif defined(RTM_NEON_INTRINSICS)
+		vst1q_f32(output, input);
 #else
 		output[0] = vector_get_x(input);
 		output[1] = vector_get_y(input);
@@ -1026,8 +1060,15 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store2(vector4f_arg0 input, float* output) RTM_NO_EXCEPT
 	{
+#if defined(RTM_SSE2_INTRINSICS)
 		output[0] = vector_get_x(input);
 		output[1] = vector_get_y(input);
+#elif defined(RTM_NEON_INTRINSICS)
+		vst1_f32(output, vget_low_f32(input)); // low half holds [x, y]; avoids a const-dropping pointer pun on the register value
+#else
+		output[0] = vector_get_x(input);
+		output[1] = vector_get_y(input);
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////
@@ -1035,11 +1076,22 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store3(vector4f_arg0 input, float* output) RTM_NO_EXCEPT
 	{
+#if defined(RTM_SSE2_INTRINSICS)
+		output[0] = vector_get_x(input);
+		output[1] = vector_get_y(input);
+		output[2] = vector_get_z(input);
+#elif defined(RTM_NEON_INTRINSICS)
+		vst1_f32(output, vget_low_f32(input)); // low half holds [x, y]; avoids a const-dropping pointer pun on the register value
+		vst1q_lane_f32(output + 2, input, 2); // lane 2 is [z]
+#else
 		output[0] = vector_get_x(input);
 		output[1] = vector_get_y(input);
 		output[2] = vector_get_z(input);
+#endif
 	}
 
+
+
 	//////////////////////////////////////////////////////////////////////////
 	// Writes a vector4 to unaligned memory.
 	//////////////////////////////////////////////////////////////////////////
@@ -1510,47 +1562,6 @@ namespace rtm
 #endif
 	}
 
-	//////////////////////////////////////////////////////////////////////////
-	// 3D cross product: lhs x rhs
-	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT
-	{
-#if defined(RTM_SSE2_INTRINSICS)
-		// cross(a, b).zxy = (a * b.yzx) - (a.yzx * b)
-		__m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1));
-		__m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1));
-		__m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs));
-
-		// cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx
-		return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1));
-#elif defined(RTM_NEON_INTRINSICS)
-		// cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx)
-		float32x4_t lhs_yzwx = vextq_f32(lhs, lhs, 1);
-		float32x4_t rhs_wxyz = vextq_f32(rhs, rhs, 3);
-
-		float32x4_t lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2);
-		float32x4_t rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0);
-
-		// part_a = (a.yzx * b.zxy)
-		float32x4_t part_a = vmulq_f32(lhs_yzx, rhs_zxy);
-
-		float32x4_t lhs_wxyz = vextq_f32(lhs, lhs, 3);
-		float32x4_t rhs_yzwx = vextq_f32(rhs, rhs, 1);
-		float32x4_t lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0);
-		float32x4_t rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2);
-
-		return vmlsq_f32(part_a, lhs_zxy, rhs_yzx);
-#else
-		// cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx)
-		const float lhs_x = vector_get_x(lhs);
-		const float lhs_y = vector_get_y(lhs);
-		const float lhs_z = vector_get_z(lhs);
-		const float rhs_x = vector_get_x(rhs);
-		const float rhs_y = vector_get_y(rhs);
-		const float rhs_z = vector_get_z(rhs);
-		return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x));
-#endif
-	}
 
 	namespace rtm_impl
 	{
@@ -2424,6 +2435,48 @@ namespace rtm
 	}
 #endif
 
+	// Computes v2 - (v0 * v1) by forwarding to vector_neg_mul_sub.
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_neg_mul_add(vector4f_arg0 v0, vector4f_arg1 v1, vector4f_arg2 v2) RTM_NO_EXCEPT
+	{
+		return vector_neg_mul_sub(v0, v1, v2);
+	}
+
+
+	//////////////////////////////////////////////////////////////////////////
+	// 3D cross product: lhs x rhs
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT
+	{
+#if defined(RTM_SSE2_INTRINSICS)
+		// cross(a, b).zxy = (a * b.yzx) - (a.yzx * b)
+		__m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1));
+		__m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1));
+		__m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs));
+
+		// cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx
+		return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1));
+#elif defined(RTM_NEON_INTRINSICS)
+		// A = rhs.yzx, B = lhs.yzx, w kept in lane 3 (assumes VECTOR_SWIZZLE takes source lane indices)
+		vector4f A = VECTOR_SWIZZLE(rhs, 1, 2, 0, 3);
+		vector4f B = VECTOR_SWIZZLE(lhs, 1, 2, 0, 3);
+		// A = lhs * rhs.yzx = (lhs.x * rhs.y, lhs.y * rhs.z, lhs.z * rhs.x)
+		A = vector_mul(A, lhs);
+		// A = A - (lhs.yzx * rhs), i.e. cross(lhs, rhs) in zxy order (same scheme as the SSE2 path)
+		A = vector_neg_mul_add(B, rhs, A);
+		// Rotate the zxy-ordered result back to xyz order
+		return VECTOR_SWIZZLE(A, 1, 2, 0, 3);
+#else
+		// cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx)
+		const float lhs_x = vector_get_x(lhs);
+		const float lhs_y = vector_get_y(lhs);
+		const float lhs_z = vector_get_z(lhs);
+		const float rhs_x = vector_get_x(rhs);
+		const float rhs_y = vector_get_y(rhs);
+		const float rhs_z = vector_get_z(rhs);
+		return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x));
+#endif
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Per component linear interpolation of the two inputs at the specified alpha.
 	// The formula used is: ((1.0 - alpha) * start) + (alpha * end).
@@ -3373,244 +3426,60 @@ namespace rtm
 	template<mix4 comp0, mix4 comp1, mix4 comp2, mix4 comp3>
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT
 	{
-#if defined(RTM_SSE4_INTRINSICS)
-        // Each component comes from the respective position of input 0 or input 1
-        if (rtm_impl::static_condition<(comp0 == mix4::a || comp0 == mix4::x) && (comp1 == mix4::b || comp1 == mix4::y) &&
-                                       (comp2 == mix4::c || comp2 == mix4::z) && (comp3 == mix4::d || comp3 == mix4::w)>::test())
-        {
-            constexpr int mask = (comp0 == mix4::a ? 1 : 0) | (comp1 == mix4::b ? 2 : 0) |
-                                 (comp2 == mix4::c ? 4 : 0) | (comp3 == mix4::d ? 8 : 0);
-            return _mm_blend_ps(input0, input1, mask);
-        }
-
-        // First component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0) && comp1 == mix4::y && comp2 == mix4::z && comp3 == mix4::w>::test())
-            return _mm_insert_ps(input0, input1, (int(comp0) % 4) << 6);
-
-        // Second component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && rtm_impl::is_mix_abcd(comp1) && comp2 == mix4::z && comp3 == mix4::w>::test())
-            return _mm_insert_ps(input0, input1, ((int(comp1) % 4) << 6) | (1 << 4));
-
-        // Third component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::y && rtm_impl::is_mix_abcd(comp2) && comp3 == mix4::w>::test())
-            return _mm_insert_ps(input0, input1, ((int(comp2) % 4) << 6) | (2 << 4));
-
-        // Fourth component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::y && comp2 == mix4::z && rtm_impl::is_mix_abcd(comp3)>::test())
-            return _mm_insert_ps(input0, input1, ((int(comp3) % 4) << 6) | (3 << 4));
-
-        // First component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0) && comp1 == mix4::b && comp2 == mix4::c && comp3 == mix4::d>::test())
-            return _mm_insert_ps(input1, input0, (int(comp0) % 4) << 6);
-
-        // Second component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && rtm_impl::is_mix_xyzw(comp1) && comp2 == mix4::c && comp3 == mix4::d>::test())
-            return _mm_insert_ps(input1, input0, ((int(comp1) % 4) << 6) | (1 << 4));
-
-        // Third component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::b && rtm_impl::is_mix_xyzw(comp2) && comp3 == mix4::d>::test())
-            return _mm_insert_ps(input1, input0, ((int(comp2) % 4) << 6) | (2 << 4));
-
-        // Fourth component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::b && comp2 == mix4::c && rtm_impl::is_mix_xyzw(comp3)>::test())
-            return _mm_insert_ps(input1, input0, ((int(comp3) % 4) << 6) | (3 << 4));
-#endif // defined(RTM_SSE4_INTRINSICS)
-
-#if defined(RTM_SSE2_INTRINSICS)
-		// All four components come from input 0
-		if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0) && rtm_impl::is_mix_xyzw(comp1) && rtm_impl::is_mix_xyzw(comp2) && rtm_impl::is_mix_xyzw(comp3)>::test())
-			return _mm_shuffle_ps(input0, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4));
-
-		// All four components come from input 1
-		if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0) && rtm_impl::is_mix_abcd(comp1) && rtm_impl::is_mix_abcd(comp2) && rtm_impl::is_mix_abcd(comp3)>::test())
-			return _mm_shuffle_ps(input1, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4));
-
-		// First two components come from input 0, second two come from input 1
-		if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0) && rtm_impl::is_mix_xyzw(comp1) && rtm_impl::is_mix_abcd(comp2) && rtm_impl::is_mix_abcd(comp3)>::test())
-			return _mm_shuffle_ps(input0, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4));
-
-		// First two components come from input 1, second two come from input 0
-		if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0) && rtm_impl::is_mix_abcd(comp1) && rtm_impl::is_mix_xyzw(comp2) && rtm_impl::is_mix_xyzw(comp3)>::test())
-			return _mm_shuffle_ps(input1, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4));
-
-		// Low words from both inputs are interleaved
-		if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::a && comp2 == mix4::y && comp3 == mix4::b>::test())
-			return _mm_unpacklo_ps(input0, input1);
-
-		// Low words from both inputs are interleaved
-		if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::x && comp2 == mix4::b && comp3 == mix4::y>::test())
-			return _mm_unpacklo_ps(input1, input0);
-
-		// High words from both inputs are interleaved
-		if (rtm_impl::static_condition<comp0 == mix4::z && comp1 == mix4::c && comp2 == mix4::w && comp3 == mix4::d>::test())
-			return _mm_unpackhi_ps(input0, input1);
-
-		// High words from both inputs are interleaved
-		if (rtm_impl::static_condition<comp0 == mix4::c && comp1 == mix4::z && comp2 == mix4::d && comp3 == mix4::w>::test())
-			return _mm_unpackhi_ps(input1, input0);
-#endif	// defined(RTM_SSE2_INTRINSICS)
-
-#if defined(RTM_NEON64_INTRINSICS)
-        // Low words from both inputs are interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::a && comp2 == mix4::y && comp3 == mix4::b>::test())
-            return vzip1q_f32(input0, input1);
-
-        // Low words from both inputs are interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::x && comp2 == mix4::b && comp3 == mix4::y>::test())
-            return vzip1q_f32(input1, input0);
-
-        // High words from both inputs are interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::z && comp1 == mix4::c && comp2 == mix4::w && comp3 == mix4::d>::test())
-            return vzip2q_f32(input0, input1);
-
-        // High words from both inputs are interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::c && comp1 == mix4::z && comp2 == mix4::d && comp3 == mix4::w>::test())
-            return vzip2q_f32(input1, input0);
-
-        // Even-numbered vector elements, consecutively
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::z && comp2 == mix4::a && comp3 == mix4::c>::test())
-            return vuzp1q_f32(input0, input1);
-
-        // Even-numbered vector elements, consecutively
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::c && comp2 == mix4::x && comp3 == mix4::z>::test())
-            return vuzp1q_f32(input1, input0);
-
-        // Odd-numbered vector elements, consecutively
-        if (rtm_impl::static_condition<comp0 == mix4::y && comp1 == mix4::w && comp2 == mix4::b && comp3 == mix4::d>::test())
-            return vuzp2q_f32(input0, input1);
-
-        // Odd-numbered vector elements, consecutively
-        if (rtm_impl::static_condition<comp0 == mix4::b && comp1 == mix4::d && comp2 == mix4::y && comp3 == mix4::w>::test())
-            return vuzp2q_f32(input1, input0);
-
-        // Even-numbered vector elements, interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::a && comp2 == mix4::z && comp3 == mix4::c>::test())
-            return vtrn1q_f32(input0, input1);
-
-        // Even-numbered vector elements, interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::x && comp2 == mix4::c && comp3 == mix4::z>::test())
-            return vtrn1q_f32(input1, input0);
-
-        // Odd-numbered vector elements, interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::y && comp1 == mix4::b && comp2 == mix4::w && comp3 == mix4::d>::test())
-            return vtrn2q_f32(input0, input1);
-
-        // Odd-numbered vector elements, interleaved
-        if (rtm_impl::static_condition<comp0 == mix4::b && comp1 == mix4::y && comp2 == mix4::d && comp3 == mix4::w>::test())
-            return vtrn2q_f32(input1, input0);
-#endif // defined(RTM_NEON64_INTRINSICS)
-
-#if defined(RTM_NEON_INTRINSICS)
-        // The highest vector elements from input 0 and the lowest vector elements from input 1, consecutively
-        if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0) &&
-                                       int(comp0) + 1 == int(comp1) &&
-                                       int(comp1) + 1 == int(comp2) &&
-                                       int(comp2) + 1 == int(comp3)>::test())
-            return vextq_f32(input0, input1, int(comp0) % 4);
-
-        // The highest vector elements from input 1 and the lowest vector elements from input 0, consecutively
-        if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0) &&
-                                       (int(comp0) + 1) % 8 == int(comp1) &&
-                                       (int(comp1) + 1) % 8 == int(comp2) &&
-                                       (int(comp2) + 1) % 8 == int(comp3)>::test())
-            return vextq_f32(input1, input0, int(comp0) % 4);
-
-        // All four components come from input 0, reversed order in each doubleword
-        if (rtm_impl::static_condition<comp0 == mix4::y && comp1 == mix4::x && comp2 == mix4::w && comp3 == mix4::z>::test())
-            return vrev64q_f32(input0);
-
-        // All four components come from input 1, reversed order in each doubleword
-        if (rtm_impl::static_condition<comp0 == mix4::b && comp1 == mix4::a && comp2 == mix4::d && comp3 == mix4::c>::test())
-            return vrev64q_f32(input1);
-
-        // First component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0) && comp1 == mix4::y && comp2 == mix4::z && comp3 == mix4::w>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp0) % 4), input0, 0);
-
-        // Second component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && rtm_impl::is_mix_abcd(comp1) && comp2 == mix4::z && comp3 == mix4::w>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp1) % 4), input0, 1);
-
-        // Third component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::y && rtm_impl::is_mix_abcd(comp2) && comp3 == mix4::w>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp2) % 4), input0, 2);
-
-        // Fourth component comes from input 1, others come from the respective positions of input 0
-        if (rtm_impl::static_condition<comp0 == mix4::x && comp1 == mix4::y && comp2 == mix4::z && rtm_impl::is_mix_abcd(comp3)>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp3) % 4), input0, 3);
-
-        // First component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0) && comp1 == mix4::b && comp2 == mix4::c && comp3 == mix4::d>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp0) % 4), input1, 0);
-
-        // Second component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && rtm_impl::is_mix_xyzw(comp1) && comp2 == mix4::c && comp3 == mix4::d>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp1) % 4), input1, 1);
-
-        // Third component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::b && rtm_impl::is_mix_xyzw(comp2) && comp3 == mix4::d>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp2) % 4), input1, 2);
-
-        // Fourth component comes from input 0, others come from the respective positions of input 1
-        if (rtm_impl::static_condition<comp0 == mix4::a && comp1 == mix4::b && comp2 == mix4::c && rtm_impl::is_mix_xyzw(comp3)>::test())
-            return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp3) % 4), input1, 3);
-
-
-        // All comes from the same position
-        if (rtm_impl::static_condition<comp0 == comp1 && comp0 == comp2 && comp0 == comp3>::test()) {
-            // All comes from the same position of input0
-            if (rtm_impl::static_condition<rtm_impl::is_mix_xyzw(comp0)>::test())
-                return vmovq_n_f32(vgetq_lane_f32(input0, int(comp0) % 4));
-            // All comes from the same position of input1
-            if (rtm_impl::static_condition<rtm_impl::is_mix_abcd(comp0)>::test())
-                return vmovq_n_f32(vgetq_lane_f32(input1, int(comp0) % 4));
-        }
-#endif // defined(RTM_NEON_INTRINSICS)
-
-        // Slow code path, not yet optimized or not using intrinsics
-		constexpr component4 component0 = rtm_impl::mix_to_component(comp0);
-		constexpr component4 component1 = rtm_impl::mix_to_component(comp1);
-		constexpr component4 component2 = rtm_impl::mix_to_component(comp2);
-		constexpr component4 component3 = rtm_impl::mix_to_component(comp3);
-
-		const float x0 = vector_get_component(input0, component0);
-		const float x1 = vector_get_component(input1, component0);
-		const float x = rtm_impl::is_mix_xyzw(comp0) ? x0 : x1;
-
-		const float y0 = vector_get_component(input0, component1);
-		const float y1 = vector_get_component(input1, component1);
-		const float y = rtm_impl::is_mix_xyzw(comp1) ? y0 : y1;
-
-		const float z0 = vector_get_component(input0, component2);
-		const float z1 = vector_get_component(input1, component2);
-		const float z = rtm_impl::is_mix_xyzw(comp2) ? z0 : z1;
-
-		const float w0 = vector_get_component(input0, component3);
-		const float w1 = vector_get_component(input1, component3);
-		const float w = rtm_impl::is_mix_xyzw(comp3) ? w0 : w1;
-
-		return vector_set(x, y, z, w);
+		// Indices 0..3 pick [xyzw] of input0, 4..7 pick [abcd] of input1.
+		constexpr int index0 = static_cast<int>(comp0);
+		constexpr int index1 = static_cast<int>(comp1);
+		constexpr int index2 = static_cast<int>(comp2);
+		constexpr int index3 = static_cast<int>(comp3);
+#if defined(__clang__) && (defined(RTM_SSE2_INTRINSICS) || defined(RTM_NEON_INTRINSICS))
+		// The builtin needs native SIMD vector operands, so builds without SSE2/NEON must take the generic path.
+		return __builtin_shufflevector(input0, input1, index0, index1, index2, index3);
+#else
+		if constexpr (index0 < 4 && index1 < 4 && index2 >= 4 && index3 >= 4) {
+			return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4);
+		} else if constexpr (index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) {
+			// input1 is not used here
+			return VECTOR_SWIZZLE(input0, index0, index1, index2, index3);
+		} else if constexpr (index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4) {
+			// input0 is not used here
+			return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 - 4);
+		} else {
+			// Arbitrary mix: spill both inputs to memory and gather by index.
+			float combine_arr[8];
+			vector_store(input0, combine_arr);
+			vector_store(input1, combine_arr + 4);
+			return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]);
+		}
+#endif
 	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [x] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix<mix4::x, mix4::x, mix4::x, mix4::x>(input, input); }
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT {
+		return VECTOR_REPLICATE(input, 0); // component index 0 is [x]
+	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [y] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix<mix4::y, mix4::y, mix4::y, mix4::y>(input, input); }
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT {
+		return VECTOR_REPLICATE(input, 1); // component index 1 is [y]
+	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [z] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix<mix4::z, mix4::z, mix4::z, mix4::z>(input, input); }
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT {
+		return VECTOR_REPLICATE(input, 2); // component index 2 is [z]
+	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Replicates the [w] component in all components.
 	//////////////////////////////////////////////////////////////////////////
-	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix<mix4::w, mix4::w, mix4::w, mix4::w>(input, input); }
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT {
+		return VECTOR_REPLICATE(input, 3); // component index 3 is [w]
+	}
 
 	//////////////////////////////////////////////////////////////////////////
 	// Logical