diff --git a/examples_tests b/examples_tests
index 0f230e1f18..071b862dc7 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 0f230e1f1834c04101957f862688ccc4d2002922
+Subproject commit 071b862dc71f6ad61d191924d72dcc00d313e9d0
diff --git a/include/ICameraSceneNode.h b/include/ICameraSceneNode.h
index e3975e3802..014c063bf2 100644
--- a/include/ICameraSceneNode.h
+++ b/include/ICameraSceneNode.h
@@ -46,17 +46,17 @@ class ICameraSceneNode : public ISceneNode
 		The function will figure it out if you've set an orthogonal matrix.
 		\param projection The new projection matrix of the camera.
 		*/
-		virtual void setProjectionMatrix(const core::matrix4SIMD& projection) =0;
+		virtual void setProjectionMatrix(const hlsl::float32_t4x4& projection) =0;
 
 		//! Gets the current projection matrix of the camera.
 		/** \return The current projection matrix of the camera. */
-		inline const core::matrix4SIMD& getProjectionMatrix() const { return projMatrix; }
+		inline const hlsl::float32_t4x4& getProjectionMatrix() const { return projMatrix; }
 
 		//! Gets the current view matrix of the camera.
 		/** \return The current view matrix of the camera. */
-		virtual const core::matrix3x4SIMD& getViewMatrix() const =0;
+		virtual const hlsl::float32_t3x4& getViewMatrix() const =0;
 
-		virtual const core::matrix4SIMD& getConcatenatedMatrix() const =0;
+		virtual const hlsl::float32_t4x4& getConcatenatedMatrix() const =0;
 #if 0
 		//! It is possible to send mouse and key events to the camera.
 		/** Most cameras may ignore this input, but camera scene nodes
diff --git a/include/matrix3x4SIMD.h b/include/matrix3x4SIMD.h
deleted file mode 100644
index d52f305cec..0000000000
--- a/include/matrix3x4SIMD.h
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX3X4SIMD_H_INCLUDED__
-#define __NBL_MATRIX3X4SIMD_H_INCLUDED__
-
-#include "vectorSIMD.h"
-#include "quaternion.h"
-
-namespace nbl::core
-{
-
-class matrix4x3;
-
-#define _NBL_MATRIX_ALIGNMENT _NBL_SIMD_ALIGNMENT
-static_assert(_NBL_MATRIX_ALIGNMENT>=_NBL_VECTOR_ALIGNMENT,"Matrix must be equally or more aligned than vector!");
-
-//! Equivalent of GLSL's mat4x3
-class matrix3x4SIMD// : private AllocationOverrideBase<_NBL_MATRIX_ALIGNMENT> EBO inheritance problem w.r.t `rows[3]`
-{
-	public:
-		_NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 3u;
-		vectorSIMDf rows[VectorCount];
-
-		explicit matrix3x4SIMD(	const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f),
-								const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f),
-								const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f)) : rows{_r0, _r1, _r2}
-		{
-		}
-
-		matrix3x4SIMD(	float _a00, float _a01, float _a02, float _a03,
-						float _a10, float _a11, float _a12, float _a13,
-						float _a20, float _a21, float _a22, float _a23)
-								: matrix3x4SIMD(vectorSIMDf(_a00, _a01, _a02, _a03),
-												vectorSIMDf(_a10, _a11, _a12, _a13),
-												vectorSIMDf(_a20, _a21, _a22, _a23))
-		{
-		}
-
-		explicit matrix3x4SIMD(const float* const _data)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4*i);
-		}
-		matrix3x4SIMD(const float* const _data, bool ALIGNED)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4*i, ALIGNED);
-		}
-
-		float* pointer() { return rows[0].pointer; }
-		const float* pointer() const { return rows[0].pointer; }
-
-		inline matrix3x4SIMD& set(const matrix4x3& _retarded);
-		inline matrix4x3 getAsRetardedIrrlichtMatrix() const;
-
-		static inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b);
-
-		static inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b);
-
-		inline matrix3x4SIMD& concatenateAfter(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByA(*this, _other);
-		}
-
-		inline matrix3x4SIMD& concatenateBefore(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByA(_other, *this);
-		}
-
-		inline matrix3x4SIMD& concatenateAfterPrecisely(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByAPrecisely(*this, _other);
-		}
-
-		inline matrix3x4SIMD& concatenateBeforePrecisely(const matrix3x4SIMD& _other)
-		{
-			return *this = concatenateBFollowedByAPrecisely(_other, *this);
-		}
-
-		inline bool operator==(const matrix3x4SIMD& _other)
-		{
-			return !(*this != _other);
-		}
-
-		inline bool operator!=(const matrix3x4SIMD& _other);
-
-
-		inline matrix3x4SIMD operator-() const
-		{
-			matrix3x4SIMD retval;
-			retval.rows[0] = -rows[0];
-			retval.rows[1] = -rows[1];
-			retval.rows[2] = -rows[2];
-			return retval;
-		}
-
-
-		inline matrix3x4SIMD& operator+=(const matrix3x4SIMD& _other);
-		inline matrix3x4SIMD operator+(const matrix3x4SIMD& _other) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval += _other;
-		}
-
-		inline matrix3x4SIMD& operator-=(const matrix3x4SIMD& _other);
-		inline matrix3x4SIMD operator-(const matrix3x4SIMD& _other) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval -= _other;
-		}
-
-		inline matrix3x4SIMD& operator*=(float _scalar);
-		inline matrix3x4SIMD operator*(float _scalar) const
-		{
-			matrix3x4SIMD retval(*this);
-			return retval *= _scalar;
-		}
-
-		inline matrix3x4SIMD& setTranslation(const vectorSIMDf& _translation)
-		{
-			// no faster way of doing it?
-			rows[0].w = _translation.x;
-			rows[1].w = _translation.y;
-			rows[2].w = _translation.z;
-			return *this;
-		}
-		inline vectorSIMDf getTranslation() const;
-		inline vectorSIMDf getTranslation3D() const;
-
-		inline matrix3x4SIMD& setScale(const vectorSIMDf& _scale);
-
-		inline vectorSIMDf getScale() const;
-
-		inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void transformVect(vectorSIMDf& _in_out) const
-		{
-			transformVect(_in_out, _in_out);
-		}
-
-		inline void pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void pseudoMulWith4x1(vectorSIMDf& _in_out) const
-		{
-			pseudoMulWith4x1(_in_out,_in_out);
-		}
-
-		inline void mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void mulSub3x3WithNx1(vectorSIMDf& _in_out) const
-		{
-			mulSub3x3WithNx1(_in_out, _in_out);
-		}
-
-		inline static matrix3x4SIMD buildCameraLookAtMatrixLH(
-			const vectorSIMDf& position,
-			const vectorSIMDf& target,
-			const vectorSIMDf& upVector);
-		inline static matrix3x4SIMD buildCameraLookAtMatrixRH(
-			const vectorSIMDf& position,
-			const vectorSIMDf& target,
-			const vectorSIMDf& upVector);
-
-		inline matrix3x4SIMD& setRotation(const quaternion& _quat);
-
-		inline matrix3x4SIMD& setScaleRotationAndTranslation(	const vectorSIMDf& _scale,
-																const quaternion& _quat,
-																const vectorSIMDf& _translation);
-
-		inline vectorSIMDf getPseudoDeterminant() const
-		{
-			vectorSIMDf tmp;
-			return determinant_helper(tmp);
-		}
-
-		inline bool getInverse(matrix3x4SIMD& _out) const;
-		bool makeInverse()
-		{
-			matrix3x4SIMD tmp;
-
-			if (getInverse(tmp))
-			{
-				*this = tmp;
-				return true;
-			}
-			return false;
-		}
-
-		//
-		inline bool getSub3x3InverseTranspose(matrix3x4SIMD& _out) const;
-
-		//
-		inline bool getSub3x3InverseTransposePacked(float outRows[9]) const
-		{
-			matrix3x4SIMD tmp;
-			if (!getSub3x3InverseTranspose(tmp))
-				return false;
-
-			float* _out = outRows;
-			for (auto i=0; i<3; i++)
-			{
-				const auto& row = tmp.rows[i];
-				for (auto j=0; j<3; j++)
-					*(_out++) = row[j];
-			}
-
-			return true;
-		}
-
-		//
-		inline core::matrix3x4SIMD getSub3x3TransposeCofactors() const;
-
-		//
-		inline void setTransformationCenter(const vectorSIMDf& _center, const vectorSIMDf& _translation);
-
-		//
-		static inline matrix3x4SIMD buildAxisAlignedBillboard(
-			const vectorSIMDf& camPos,
-			const vectorSIMDf& center,
-			const vectorSIMDf& translation,
-			const vectorSIMDf& axis,
-			const vectorSIMDf& from);
-
-
-		//
-		float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; }
-		const float& operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; }
-
-		//
-		inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; }
-		inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; }
-
-	private:
-		static inline vectorSIMDf doJob(const __m128& a, const matrix3x4SIMD& _mtx);
-
-		// really need that dvec<2> or wider
-		inline __m128d halfRowAsDouble(size_t _n, bool _0) const;
-		static inline __m128d doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf);
-
-		vectorSIMDf determinant_helper(vectorSIMDf& r1crossr2) const
-		{
-			r1crossr2 = core::cross(rows[1], rows[2]);
-			return core::dot(rows[0], r1crossr2);
-		}
-};
-
-inline matrix3x4SIMD concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return matrix3x4SIMD::concatenateBFollowedByA(_a, _b);
-}
-/*
-inline matrix3x4SIMD concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return matrix3x4SIMD::concatenateBFollowedByAPrecisely(_a, _b);
-}
-*/
-
-}
-
-#endif
diff --git a/include/matrix3x4SIMD_impl.h b/include/matrix3x4SIMD_impl.h
deleted file mode 100644
index 0e9022efd0..0000000000
--- a/include/matrix3x4SIMD_impl.h
+++ /dev/null
@@ -1,470 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_
-#define _NBL_MATRIX3X4SIMD_IMPL_H_INCLUDED_
-
-#include "matrix3x4SIMD.h"
-#include "nbl/core/math/glslFunctions.tcc"
-
-namespace nbl::core
-{
-
-// TODO: move to another implementation header
-inline quaternion::quaternion(const matrix3x4SIMD& m)
-{
-	const vectorSIMDf one(1.f);
-	auto Qx  = m.rows[0].xxxx()^vectorSIMDu32(0,0,0x80000000u,0x80000000u);
-	auto Qy  = m.rows[1].yyyy()^vectorSIMDu32(0,0x80000000u,0,0x80000000u);
-	auto Qz  = m.rows[2].zzzz()^vectorSIMDu32(0,0x80000000u,0x80000000u,0);
-
-	auto tmp = one+Qx+Qy+Qz;
-	auto invscales = inversesqrt(tmp)*0.5f;
-	auto scales = tmp*invscales*0.5f;
-
-	// TODO: speed this up
-	if (tmp.x > 0.0f)
-	{
-		X = (m(2, 1) - m(1, 2)) * invscales.x;
-		Y = (m(0, 2) - m(2, 0)) * invscales.x;
-		Z = (m(1, 0) - m(0, 1)) * invscales.x;
-		W = scales.x;
-	}
-	else
-	{
-		if (tmp.y>0.f)
-		{
-			X = scales.y;
-			Y = (m(0, 1) + m(1, 0)) * invscales.y;
-			Z = (m(2, 0) + m(0, 2)) * invscales.y;
-			W = (m(2, 1) - m(1, 2)) * invscales.y;
-		}
-		else if (tmp.z>0.f)
-		{
-			X = (m(0, 1) + m(1, 0)) * invscales.z;
-			Y = scales.z;
-			Z = (m(1, 2) + m(2, 1)) * invscales.z;
-			W = (m(0, 2) - m(2, 0)) * invscales.z;
-		}
-		else
-		{
-			X = (m(0, 2) + m(2, 0)) * invscales.w;
-			Y = (m(1, 2) + m(2, 1)) * invscales.w;
-			Z = scales.w;
-			W = (m(1, 0) - m(0, 1)) * invscales.w;
-		}
-	}
-
-	*this = normalize(*this);
-}
-
-inline bool matrix3x4SIMD::operator!=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		if ((rows[i] != _other.rows[i]).any())
-			return true;
-	return false;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::operator+=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] += _other.rows[i];
-	return *this;
-}
-inline matrix3x4SIMD& matrix3x4SIMD::operator-=(const matrix3x4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] -= _other.rows[i];
-	return *this;
-}
-inline matrix3x4SIMD& matrix3x4SIMD::operator*=(float _scalar)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] *= _scalar;
-	return *this;
-}
-
-#ifdef __NBL_COMPILE_WITH_SSE3
-#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx)
-#define BUILD_XORMASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_ ? 0x80000000u:0x0u, _y_ ? 0x80000000u:0x0u, _z_ ? 0x80000000u:0x0u, _w_ ? 0x80000000u:0x0u)
-#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff)
-
-inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByA(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-#ifdef _NBL_DEBUG
-	assert(is_aligned_to(&_a, _NBL_SIMD_ALIGNMENT));
-	assert(is_aligned_to(&_b, _NBL_SIMD_ALIGNMENT));
-#endif // _NBL_DEBUG
-	__m128 r0 = _a.rows[0].getAsRegister();
-	__m128 r1 = _a.rows[1].getAsRegister();
-	__m128 r2 = _a.rows[2].getAsRegister();
-
-	matrix3x4SIMD out;
-	out.rows[0] = matrix3x4SIMD::doJob(r0, _b);
-	out.rows[1] = matrix3x4SIMD::doJob(r1, _b);
-	out.rows[2] = matrix3x4SIMD::doJob(r2, _b);
-
-	return out;
-}
-
-inline matrix3x4SIMD matrix3x4SIMD::concatenateBFollowedByAPrecisely(const matrix3x4SIMD& _a, const matrix3x4SIMD& _b)
-{
-	__m128d r00 = _a.halfRowAsDouble(0u, true);
-	__m128d r01 = _a.halfRowAsDouble(0u, false);
-	__m128d r10 = _a.halfRowAsDouble(1u, true);
-	__m128d r11 = _a.halfRowAsDouble(1u, false);
-	__m128d r20 = _a.halfRowAsDouble(2u, true);
-	__m128d r21 = _a.halfRowAsDouble(2u, false);
-
-	matrix3x4SIMD out;
-
-	const __m128i mask0011 = BUILD_MASKF(0, 0, 1, 1);
-
-	__m128 second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, false));
-	out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, false));
-	out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	second = _mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, false));
-	out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(matrix3x4SIMD::doJob_d(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-
-	return out;
-}
-
-inline vectorSIMDf matrix3x4SIMD::getTranslation() const
-{
-	__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setr_ps(0.f, 0.f, 0.f, 1.f)); // (2z,3z,2w,3w)
-	__m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,3w)
-
-	return xmm2;
-}
-inline vectorSIMDf matrix3x4SIMD::getTranslation3D() const
-{
-	__m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0)
-	__m128 xmm2 = _mm_movehl_ps(xmm1, xmm0);// (0w,1w,2w,0)
-
-	return xmm2;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setScale(const core::vectorSIMDf& _scale)
-{
-	const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1));
-	const vectorSIMDu32 mask0010 = vectorSIMDu32(BUILD_MASKF(0, 0, 1, 0));
-	const vectorSIMDu32 mask0100 = vectorSIMDu32(BUILD_MASKF(0, 1, 0, 0));
-	const vectorSIMDu32 mask1000 = vectorSIMDu32(BUILD_MASKF(1, 0, 0, 0));
-
-	const vectorSIMDu32& scaleAlias = reinterpret_cast<const vectorSIMDu32&>(_scale);
-
-	vectorSIMDu32& rowAlias0 = reinterpret_cast<vectorSIMDu32&>(rows[0]);
-	vectorSIMDu32& rowAlias1 = reinterpret_cast<vectorSIMDu32&>(rows[1]);
-	vectorSIMDu32& rowAlias2 = reinterpret_cast<vectorSIMDu32&>(rows[2]);
-	rowAlias0 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask1000)) | (rowAlias0 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-	rowAlias1 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask0100)) | (rowAlias1 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-	rowAlias2 = (scaleAlias & reinterpret_cast<const vectorSIMDf&>(mask0010)) | (rowAlias2 & reinterpret_cast<const vectorSIMDf&>(mask0001));
-
-	return *this;
-}
-
-inline core::vectorSIMDf matrix3x4SIMD::getScale() const
-{
-	// xmm4-7 will now become columuns of B
-	__m128 xmm4 = rows[0].getAsRegister();
-	__m128 xmm5 = rows[1].getAsRegister();
-	__m128 xmm6 = rows[2].getAsRegister();
-	__m128 xmm7 = _mm_setzero_ps();
-	// g==0
-	__m128 xmm0 = _mm_unpacklo_ps(xmm4, xmm5);
-	__m128 xmm1 = _mm_unpacklo_ps(xmm6, xmm7); // (2x,g,2y,g)
-	__m128 xmm2 = _mm_unpackhi_ps(xmm4, xmm5);
-	__m128 xmm3 = _mm_unpackhi_ps(xmm6, xmm7); // (2z,g,2w,g)
-	xmm4 = _mm_movelh_ps(xmm1, xmm0); //(0x,1x,2x,g)
-	xmm5 = _mm_movehl_ps(xmm1, xmm0);
-	xmm6 = _mm_movelh_ps(xmm3, xmm2); //(0z,1z,2z,g)
-
-	// See http://www.robertblum.com/articles/2005/02/14/decomposing-matrices
-	// We have to do the full calculation.
-	xmm0 = _mm_mul_ps(xmm4, xmm4);// column 0 squared
-	xmm1 = _mm_mul_ps(xmm5, xmm5);// column 1 squared
-	xmm2 = _mm_mul_ps(xmm6, xmm6);// column 2 squared
-	xmm4 = _mm_hadd_ps(xmm0, xmm1);
-	xmm5 = _mm_hadd_ps(xmm2, xmm7);
-	xmm6 = _mm_hadd_ps(xmm4, xmm5);
-
-	return _mm_sqrt_ps(xmm6);
-}
-
-inline void matrix3x4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	vectorSIMDf r0 = rows[0] * _in,
-		r1 = rows[1] * _in,
-		r2 = rows[2] * _in;
-
-	_out =
-		_mm_hadd_ps(
-			_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()),
-			_mm_hadd_ps(r2.getAsRegister(), _mm_set1_ps(0.25f))
-		);
-}
-
-inline void matrix3x4SIMD::pseudoMulWith4x1(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	__m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-	_out = (_in & mask1110) | _mm_castps_si128(vectorSIMDf(0.f, 0.f, 0.f, 1.f).getAsRegister());
-	transformVect(_out);
-}
-
-inline void matrix3x4SIMD::mulSub3x3WithNx1(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	auto maskedIn = _in & BUILD_MASKF(1, 1, 1, 0);
-	vectorSIMDf r0 = rows[0] * maskedIn,
-		r1 = rows[1] * maskedIn,
-		r2 = rows[2] * maskedIn;
-
-	_out =
-		_mm_hadd_ps(
-			_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()),
-			_mm_hadd_ps(r2.getAsRegister(), _mm_setzero_ps())
-		);
-}
-
-
-inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixLH(
-	const core::vectorSIMDf& position,
-	const core::vectorSIMDf& target,
-	const core::vectorSIMDf& upVector)
-{
-	const core::vectorSIMDf zaxis = core::normalize(target - position);
-	const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis));
-	const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis);
-
-	matrix3x4SIMD r;
-	r.rows[0] = xaxis;
-	r.rows[1] = yaxis;
-	r.rows[2] = zaxis;
-	r.rows[0].w = -dot(xaxis, position)[0];
-	r.rows[1].w = -dot(yaxis, position)[0];
-	r.rows[2].w = -dot(zaxis, position)[0];
-
-	return r;
-}
-inline matrix3x4SIMD matrix3x4SIMD::buildCameraLookAtMatrixRH(
-	const core::vectorSIMDf& position,
-	const core::vectorSIMDf& target,
-	const core::vectorSIMDf& upVector)
-{
-	const core::vectorSIMDf zaxis = core::normalize(position - target);
-	const core::vectorSIMDf xaxis = core::normalize(core::cross(upVector, zaxis));
-	const core::vectorSIMDf yaxis = core::cross(zaxis, xaxis);
-
-	matrix3x4SIMD r;
-	r.rows[0] = xaxis;
-	r.rows[1] = yaxis;
-	r.rows[2] = zaxis;
-	r.rows[0].w = -dot(xaxis, position)[0];
-	r.rows[1].w = -dot(yaxis, position)[0];
-	r.rows[2].w = -dot(zaxis, position)[0];
-
-	return r;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setRotation(const core::quaternion& _quat)
-{
-	const vectorSIMDu32 mask0001 = vectorSIMDu32(BUILD_MASKF(0, 0, 0, 1));
-	const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-
-	const core::vectorSIMDf& quat = reinterpret_cast<const core::vectorSIMDf&>(_quat);
-	rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * vectorSIMDf(2.f))) + (quat.zzzz() * (quat.zwxx() & mask1110) * vectorSIMDf(2.f, -2.f, 2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[0]) & (mask0001));
-	rows[0].x = 1.f - rows[0].x;
-
-	rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * vectorSIMDf(2.f))) + (quat.xxxx() * (quat.yxwx() & mask1110) * vectorSIMDf(2.f, 2.f, -2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[1]) & (mask0001));
-	rows[1].y = 1.f - rows[1].y;
-
-	rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * vectorSIMDf(2.f))) + (quat.yyyy() * (quat.wzyx() & mask1110) * vectorSIMDf(-2.f, 2.f, 2.f, 0.f))) | (reinterpret_cast<const vectorSIMDu32&>(rows[2]) & (mask0001));
-	rows[2].z = 1.f - rows[2].z;
-
-	return *this;
-}
-
-inline matrix3x4SIMD& matrix3x4SIMD::setScaleRotationAndTranslation(const vectorSIMDf& _scale, const core::quaternion& _quat, const vectorSIMDf& _translation)
-{
-	const __m128i mask1110 = BUILD_MASKF(1, 1, 1, 0);
-
-	const vectorSIMDf& quat = reinterpret_cast<const vectorSIMDf&>(_quat);
-	const vectorSIMDf dblScale = (_scale * 2.f) & mask1110;
-
-	vectorSIMDf mlt = dblScale ^ BUILD_XORMASKF(0, 1, 0, 0);
-	rows[0] = ((quat.yyyy() * ((quat.yxwx() & mask1110) * dblScale)) + (quat.zzzz() * (quat.zwxx() & mask1110) * mlt));
-	rows[0].x = _scale.x - rows[0].x;
-
-	mlt = dblScale ^ BUILD_XORMASKF(0, 0, 1, 0);
-	rows[1] = ((quat.zzzz() * ((quat.wzyx() & mask1110) * dblScale)) + (quat.xxxx() * (quat.yxwx() & mask1110) * mlt));
-	rows[1].y = _scale.y - rows[1].y;
-
-	mlt = dblScale ^ BUILD_XORMASKF(1, 0, 0, 0);
-	rows[2] = ((quat.xxxx() * ((quat.zwxx() & mask1110) * dblScale)) + (quat.yyyy() * (quat.wzyx() & mask1110) * mlt));
-	rows[2].z = _scale.z - rows[2].z;
-
-	setTranslation(_translation);
-
-	return *this;
-}
-
-
-inline bool matrix3x4SIMD::getInverse(matrix3x4SIMD& _out) const //! SUBOPTIMAL - OPTIMIZE!
-{
-	auto translation = getTranslation();
-	// `tmp` will have columns in its `rows`
-	core::matrix4SIMD tmp;
-	auto* cols = tmp.rows;
-	if (!getSub3x3InverseTranspose(reinterpret_cast<core::matrix3x4SIMD&>(tmp)))
-		return false;
-
-	// find inverse post-translation
-	cols[3] = -cols[0]*translation.xxxx()-cols[1]*translation.yyyy()-cols[2]*translation.zzzz();
-
-	// columns into rows
-	_out = transpose(tmp).extractSub3x4();
-
-	return true;
-}
-
-inline bool matrix3x4SIMD::getSub3x3InverseTranspose(core::matrix3x4SIMD& _out) const
-{
-	vectorSIMDf r1crossr2;
-	const vectorSIMDf d = determinant_helper(r1crossr2);
-	if (core::iszero(d.x, FLT_MIN))
-		return false;
-	auto rcp = core::reciprocal(d);
-
-	// matrix of cofactors * 1/det
-	_out = getSub3x3TransposeCofactors();
-	_out.rows[0] *= rcp;
-	_out.rows[1] *= rcp;
-	_out.rows[2] *= rcp;
-
-	return true;
-}
-
-inline core::matrix3x4SIMD matrix3x4SIMD::getSub3x3TransposeCofactors() const
-{
-	core::matrix3x4SIMD _out;
-	_out.rows[0] = core::cross(rows[1], rows[2]);
-	_out.rows[1] = core::cross(rows[2], rows[0]);
-	_out.rows[2] = core::cross(rows[0], rows[1]);
-	return _out;
-}
-
-// TODO: Double check this!-
-inline void matrix3x4SIMD::setTransformationCenter(const core::vectorSIMDf& _center, const core::vectorSIMDf& _translation)
-{
-	core::vectorSIMDf r0 = rows[0] * _center;
-	core::vectorSIMDf r1 = rows[1] * _center;
-	core::vectorSIMDf r2 = rows[2] * _center;
-	core::vectorSIMDf r3(0.f, 0.f, 0.f, 1.f);
-
-	__m128 col3 = _mm_hadd_ps(_mm_hadd_ps(r0.getAsRegister(), r1.getAsRegister()), _mm_hadd_ps(r2.getAsRegister(), r3.getAsRegister()));
-	const vectorSIMDf vcol3 = _center - _translation - col3;
-
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i].w = vcol3.pointer[i];
-}
-
-
-// TODO: Double check this!
-inline matrix3x4SIMD matrix3x4SIMD::buildAxisAlignedBillboard(
-	const core::vectorSIMDf& camPos,
-	const core::vectorSIMDf& center,
-	const core::vectorSIMDf& translation,
-	const core::vectorSIMDf& axis,
-	const core::vectorSIMDf& from)
-{
-	// axis of rotation
-	const core::vectorSIMDf up = core::normalize(axis);
-	const core::vectorSIMDf forward = core::normalize(camPos - center);
-	const core::vectorSIMDf right = core::normalize(core::cross(up, forward));
-
-	// correct look vector
-	const core::vectorSIMDf look = core::cross(right, up);
-
-	// rotate from to
-	// axis multiplication by sin
-	const core::vectorSIMDf vs = core::cross(look, from);
-
-	// cosinus angle
-	const core::vectorSIMDf ca = core::cross(from, look);
-
-	const core::vectorSIMDf vt(up * (core::vectorSIMDf(1.f) - ca));
-	const core::vectorSIMDf wt = vt * up.yzxx();
-	const core::vectorSIMDf vtuppca = vt * up + ca;
-
-	matrix3x4SIMD mat;
-	core::vectorSIMDf& row0 = mat.rows[0];
-	core::vectorSIMDf& row1 = mat.rows[1];
-	core::vectorSIMDf& row2 = mat.rows[2];
-
-	row0 = vtuppca & BUILD_MASKF(1, 0, 0, 0);
-	row1 = vtuppca & BUILD_MASKF(0, 1, 0, 0);
-	row2 = vtuppca & BUILD_MASKF(0, 0, 1, 0);
-
-	row0 += (wt.xxzx() + vs.xzyx() * core::vectorSIMDf(1.f, 1.f, -1.f, 1.f)) & BUILD_MASKF(0, 1, 1, 0);
-	row1 += (wt.xxyx() + vs.zxxx() * core::vectorSIMDf(-1.f, 1.f, 1.f, 1.f)) & BUILD_MASKF(1, 0, 1, 0);
-	row2 += (wt.zyxx() + vs.yxxx() * core::vectorSIMDf(1.f, -1.f, 1.f, 1.f)) & BUILD_MASKF(1, 1, 0, 0);
-
-	mat.setTransformationCenter(center, translation);
-	return mat;
-}
-
-
-
-inline vectorSIMDf matrix3x4SIMD::doJob(const __m128& a, const matrix3x4SIMD& _mtx)
-{
-	__m128 r0 = _mtx.rows[0].getAsRegister();
-	__m128 r1 = _mtx.rows[1].getAsRegister();
-	__m128 r2 = _mtx.rows[2].getAsRegister();
-
-	const __m128i mask = _mm_setr_epi32(0, 0, 0, 0xffffffff);
-
-	vectorSIMDf res;
-	res = _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(0)), r0);
-	res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(1)), r1);
-	res += _mm_mul_ps(_mm_shuffle_ps(a, a, BROADCAST32(2)), r2);
-	res += vectorSIMDf(a) & mask; // always 0 0 0 a3 -- no shuffle needed
-	return res;
-	}
-
-inline __m128d matrix3x4SIMD::halfRowAsDouble(size_t _n, bool _0) const
-{
-	return _mm_cvtps_pd(_0 ? rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister());
-}
-inline __m128d matrix3x4SIMD::doJob_d(const __m128d& _a0, const __m128d& _a1, const matrix3x4SIMD& _mtx, bool _xyHalf)
-{
-	__m128d r0 = _mtx.halfRowAsDouble(0u, _xyHalf);
-	__m128d r1 = _mtx.halfRowAsDouble(1u, _xyHalf);
-	__m128d r2 = _mtx.halfRowAsDouble(2u, _xyHalf);
-
-	const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff));
-
-	__m128d res;
-	res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0);
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3), r1));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2));
-	if (!_xyHalf)
-		res = _mm_add_pd(res, _mm_and_pd(_a1, mask01));
-	return res;
-}
-
-#undef BUILD_MASKF
-#undef BUILD_XORMASKF
-#undef BROADCAST32
-#else
-#error "no implementation"
-#endif
-
-} // nbl::core
-
-#endif
diff --git a/include/matrix4SIMD.h b/include/matrix4SIMD.h
deleted file mode 100644
index 03126c61f7..0000000000
--- a/include/matrix4SIMD.h
+++ /dev/null
@@ -1,385 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX4SIMD_H_INCLUDED__
-#define __NBL_MATRIX4SIMD_H_INCLUDED__
-
-#include "matrix3x4SIMD.h"
-
-namespace nbl
-{
-namespace core
-{
-
-template<typename T>
-class aabbox3d;
-
-
-class matrix4SIMD// : public AlignedBase<_NBL_SIMD_ALIGNMENT> don't inherit from AlignedBase (which is empty) because member `rows[4]` inherits from it as well
-{
-	public:
-		_NBL_STATIC_INLINE_CONSTEXPR uint32_t VectorCount = 4u;
-		vectorSIMDf rows[VectorCount];
-
-		inline explicit matrix4SIMD(const vectorSIMDf& _r0 = vectorSIMDf(1.f, 0.f, 0.f, 0.f),
-									const vectorSIMDf& _r1 = vectorSIMDf(0.f, 1.f, 0.f, 0.f),
-									const vectorSIMDf& _r2 = vectorSIMDf(0.f, 0.f, 1.f, 0.f),
-									const vectorSIMDf& _r3 = vectorSIMDf(0.f, 0.f, 0.f, 1.f))
-											: rows{ _r0, _r1, _r2, _r3 }
-		{
-		}
-
-		inline matrix4SIMD(	float _a00, float _a01, float _a02, float _a03,
-							float _a10, float _a11, float _a12, float _a13,
-							float _a20, float _a21, float _a22, float _a23,
-							float _a30, float _a31, float _a32, float _a33)
-									: matrix4SIMD(	vectorSIMDf(_a00, _a01, _a02, _a03),
-													vectorSIMDf(_a10, _a11, _a12, _a13),
-													vectorSIMDf(_a20, _a21, _a22, _a23),
-													vectorSIMDf(_a30, _a31, _a32, _a33))
-		{
-		}
-
-		inline explicit matrix4SIMD(const float* const _data)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4 * i);
-		}
-		inline matrix4SIMD(const float* const _data, bool ALIGNED)
-		{
-			if (!_data)
-				return;
-			for (size_t i = 0u; i < VectorCount; ++i)
-				rows[i] = vectorSIMDf(_data + 4 * i, ALIGNED);
-		}
-
-		inline explicit matrix4SIMD(const matrix3x4SIMD& smallMat)
-		{
-			*reinterpret_cast<matrix3x4SIMD*>(this) = smallMat;
-			rows[3].set(0.f,0.f,0.f,1.f);
-		}
-
-		inline matrix3x4SIMD extractSub3x4() const
-		{
-			return matrix3x4SIMD(rows[0],rows[1],rows[2]);
-		}
-
-		//! Access by row
-		inline const vectorSIMDf& getRow(size_t _rown) const{ return rows[_rown]; }
-		inline vectorSIMDf& getRow(size_t _rown) { return rows[_rown]; }
-
-		//! Access by element
-		inline float operator()(size_t _i, size_t _j) const { return rows[_i].pointer[_j]; }
-		inline float& operator()(size_t _i, size_t _j) { return rows[_i].pointer[_j]; }
-
-		//! Access for memory
-		inline const float* pointer() const {return rows[0].pointer;}
-		inline float* pointer() {return rows[0].pointer;}
-
-
-		inline bool operator==(const matrix4SIMD& _other) const
-		{
-			return !(*this != _other);
-		}
-		inline bool operator!=(const matrix4SIMD& _other) const;
-
-		inline matrix4SIMD& operator+=(const matrix4SIMD& _other);
-		inline matrix4SIMD operator+(const matrix4SIMD& _other) const
-		{
-			matrix4SIMD r{*this};
-			return r += _other;
-		}
-
-		inline matrix4SIMD& operator-=(const matrix4SIMD& _other);
-		inline matrix4SIMD operator-(const matrix4SIMD& _other) const
-		{
-			matrix4SIMD r{*this};
-			return r -= _other;
-		}
-
-		inline matrix4SIMD& operator*=(float _scalar);
-		inline matrix4SIMD operator*(float _scalar) const
-		{
-			matrix4SIMD r{*this};
-			return r *= _scalar;
-		}
-
-		static inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b);
-		static inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b);
-
-		inline bool isIdentity() const
-		{
-			return *this == matrix4SIMD();
-		}
-		inline bool isIdentity(float _tolerance) const;
-
-		inline bool isOrthogonal() const
-		{
-			return concatenateBFollowedByA(transpose(*this), *this).isIdentity();
-		}
-		inline bool isOrthogonal(float _tolerance) const
-		{
-			return concatenateBFollowedByA(transpose(*this), *this).isIdentity(_tolerance);
-		}
-
-		inline matrix4SIMD& setScale(const core::vectorSIMDf& _scale);
-		inline matrix4SIMD& setScale(float _scale)
-		{
-			return setScale(vectorSIMDf(_scale));
-		}
-
-		inline void setTranslation(const float* _t)
-		{
-			for (size_t i = 0u; i < 3u; ++i)
-				rows[i].w = _t[i];
-		}
-		//! Takes into account only x,y,z components of _t
-		inline void setTranslation(const vectorSIMDf& _t)
-		{
-			setTranslation(_t.pointer);
-		}
-		inline void setTranslation(const vector3d<float>& _t)
-		{
-			setTranslation(&_t.X);
-		}
-
-		//! Returns last column of the matrix.
-		inline vectorSIMDf getTranslation() const;
-
-		//! Returns translation part of the matrix (w component is always 0).
-		inline vectorSIMDf getTranslation3D() const;
-		
-		enum class E_MATRIX_INVERSE_PRECISION
-		{
-		  EMIP_FAST_RECIPROCAL,
-		  EMIP_32BIT,
-		  EMIP_64BBIT
-		};
-
-		template<E_MATRIX_INVERSE_PRECISION precision = E_MATRIX_INVERSE_PRECISION::EMIP_FAST_RECIPROCAL>
-		inline bool getInverseTransform(matrix4SIMD& _out) const
-		{
-			if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_64BBIT)
-			{
-				double a = rows[0][0], b = rows[0][1], c = rows[0][2], d = rows[0][3];
-				double e = rows[1][0], f = rows[1][1], g = rows[1][2], h = rows[1][3];
-				double i = rows[2][0], j = rows[2][1], k = rows[2][2], l = rows[2][3];
-				double m = rows[3][0], n = rows[3][1], o = rows[3][2], p = rows[3][3];
-
-				double kp_lo = k * p - l * o;
-				double jp_ln = j * p - l * n;
-				double jo_kn = j * o - k * n;
-				double ip_lm = i * p - l * m;
-				double io_km = i * o - k * m;
-				double in_jm = i * n - j * m;
-
-				double a11 = +(f * kp_lo - g * jp_ln + h * jo_kn);
-				double a12 = -(e * kp_lo - g * ip_lm + h * io_km);
-				double a13 = +(e * jp_ln - f * ip_lm + h * in_jm);
-				double a14 = -(e * jo_kn - f * io_km + g * in_jm);
-
-				double det = a * a11 + b * a12 + c * a13 + d * a14;
-
-				if (core::iszero(det, DBL_MIN))
-					return false;
-
-				double invDet = 1.0 / det;
-
-				_out.rows[0][0] = a11 * invDet;
-				_out.rows[1][0] = a12 * invDet;
-				_out.rows[2][0] = a13 * invDet;
-				_out.rows[3][0] = a14 * invDet;
-
-				_out.rows[0][1] = -(b * kp_lo - c * jp_ln + d * jo_kn) * invDet;
-				_out.rows[1][1] = +(a * kp_lo - c * ip_lm + d * io_km) * invDet;
-				_out.rows[2][1] = -(a * jp_ln - b * ip_lm + d * in_jm) * invDet;
-				_out.rows[3][1] = +(a * jo_kn - b * io_km + c * in_jm) * invDet;
-
-				double gp_ho = g * p - h * o;
-				double fp_hn = f * p - h * n;
-				double fo_gn = f * o - g * n;
-				double ep_hm = e * p - h * m;
-				double eo_gm = e * o - g * m;
-				double en_fm = e * n - f * m;
-
-				_out.rows[0][2] = +(b * gp_ho - c * fp_hn + d * fo_gn) * invDet;
-				_out.rows[1][2] = -(a * gp_ho - c * ep_hm + d * eo_gm) * invDet;
-				_out.rows[2][2] = +(a * fp_hn - b * ep_hm + d * en_fm) * invDet;
-				_out.rows[3][2] = -(a * fo_gn - b * eo_gm + c * en_fm) * invDet;
-
-				double gl_hk = g * l - h * k;
-				double fl_hj = f * l - h * j;
-				double fk_gj = f * k - g * j;
-				double el_hi = e * l - h * i;
-				double ek_gi = e * k - g * i;
-				double ej_fi = e * j - f * i;
-
-				_out.rows[0][3] = -(b * gl_hk - c * fl_hj + d * fk_gj) * invDet;
-				_out.rows[1][3] = +(a * gl_hk - c * el_hi + d * ek_gi) * invDet;
-				_out.rows[2][3] = -(a * fl_hj - b * el_hi + d * ej_fi) * invDet;
-				_out.rows[3][3] = +(a * fk_gj - b * ek_gi + c * ej_fi) * invDet;
-
-				return true;
-			}
-			else
-			{
-				auto mat2mul = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A*_B.xwxw()+_A.yxwz()*_B.zyzy();
-				};
-				auto mat2adjmul = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A.wwxx()*_B-_A.yyzz()*_B.zwxy();
-				};
-				auto mat2muladj = [](vectorSIMDf _A, vectorSIMDf _B)
-				{
-					return _A*_B.wxwx()-_A.yxwz()*_B.zyzy();
-				};
-
-				vectorSIMDf A = _mm_movelh_ps(rows[0].getAsRegister(), rows[1].getAsRegister());
-				vectorSIMDf B = _mm_movehl_ps(rows[1].getAsRegister(), rows[0].getAsRegister());
-				vectorSIMDf C = _mm_movelh_ps(rows[2].getAsRegister(), rows[3].getAsRegister());
-				vectorSIMDf D = _mm_movehl_ps(rows[3].getAsRegister(), rows[2].getAsRegister());
-
-				vectorSIMDf allDets =	vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(2,0,2,0)))*
-										vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(3,1,3,1)))
-									-
-										vectorSIMDf(_mm_shuffle_ps(rows[0].getAsRegister(),rows[2].getAsRegister(),_MM_SHUFFLE(3,1,3,1)))*
-										vectorSIMDf(_mm_shuffle_ps(rows[1].getAsRegister(),rows[3].getAsRegister(),_MM_SHUFFLE(2,0,2,0)));
-
-				auto detA = allDets.xxxx();
-				auto detB = allDets.yyyy();
-				auto detC = allDets.zzzz();
-				auto detD = allDets.wwww();
-
-				// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
-				auto D_C = mat2adjmul(D, C);
-				// A#B
-				auto A_B = mat2adjmul(A, B);
-				// X# = |D|A - B(D#C)
-				auto X_ = detD*A - mat2mul(B, D_C);
-				// W# = |A|D - C(A#B)
-				auto W_ = detA*D - mat2mul(C, A_B);
-
-				// |M| = |A|*|D| + ... (continue later)
-				auto detM = detA*detD;
-
-				// Y# = |B|C - D(A#B)#
-				auto Y_ = detB*C - mat2muladj(D, A_B);
-				// Z# = |C|B - A(D#C)#
-				auto Z_ = detC*B -  mat2muladj(A, D_C);
-
-				// |M| = |A|*|D| + |B|*|C| ... (continue later)
-				detM += detB*detC;
-
-				// tr((A#B)(D#C))
-				__m128 tr = (A_B*D_C.xzyw()).getAsRegister();
-				tr = _mm_hadd_ps(tr, tr);
-				tr = _mm_hadd_ps(tr, tr);
-				// |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C)
-				detM -= tr;
-
-				if (core::iszero(detM.x, FLT_MIN))
-					return false;
-
-				vectorSIMDf rDetM;
-
-				// (1/|M|, -1/|M|, -1/|M|, 1/|M|)
-				if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_FAST_RECIPROCAL)
-					rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f)*core::reciprocal(detM);
-				else if constexpr (precision == E_MATRIX_INVERSE_PRECISION::EMIP_32BIT)
-					rDetM = vectorSIMDf(1.f, -1.f, -1.f, 1.f).preciseDivision(detM);
-
-				X_ *= rDetM;
-				Y_ *= rDetM;
-				Z_ *= rDetM;
-				W_ *= rDetM;
-
-				// apply adjugate and store, here we combine adjugate shuffle and store shuffle
-				_out.rows[0] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3));
-				_out.rows[1] = _mm_shuffle_ps(X_.getAsRegister(), Y_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2));
-				_out.rows[2] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(1, 3, 1, 3));
-				_out.rows[3] = _mm_shuffle_ps(Z_.getAsRegister(), W_.getAsRegister(), _MM_SHUFFLE(0, 2, 0, 2));
-
-				return true;
-			}
-		}
-
-		inline vectorSIMDf sub3x3TransformVect(const vectorSIMDf& _in) const;
-
-		inline void transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const;
-		inline void transformVect(vectorSIMDf& _vector) const
-		{
-			transformVect(_vector, _vector);
-		}
-
-		inline void translateVect(vectorSIMDf& _vect) const
-		{
-			_vect += getTranslation();
-		}
-
-		bool isBoxInFrustum(const aabbox3d<float>& bbox);
-
-		bool perspectiveTransformVect(core::vectorSIMDf& inOutVec)
-		{
-			transformVect(inOutVec);
-			const bool inFront = inOutVec[3] > 0.f;
-			inOutVec /= inOutVec.wwww();
-			return inFront;
-		}
-
-		core::vector2di fragCoordTransformVect(const core::vectorSIMDf& _in, const core::dimension2du& viewportDimensions)
-		{
-			core::vectorSIMDf pos(_in);
-			pos.w = 1.f;
-			if (perspectiveTransformVect(pos))
-				core::vector2di(-0x80000000, -0x80000000);
-
-			pos[0] *= 0.5f;
-			pos[1] *= 0.5f;
-			pos[0] += 0.5f;
-			pos[1] += 0.5f;
-
-			return core::vector2di(pos[0] * float(viewportDimensions.Width), pos[1] * float(viewportDimensions.Height));
-		}
-
-		static inline matrix4SIMD buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar);
-		static inline matrix4SIMD buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar);
-
-		static inline matrix4SIMD buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar);
-		static inline matrix4SIMD buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar);
-
-		//! Access by row
-		inline const vectorSIMDf& operator[](size_t _rown) const { return rows[_rown]; }
-		//! Access by row
-		inline vectorSIMDf& operator[](size_t _rown) { return rows[_rown]; }
-
-	private:
-		//! TODO: implement a dvec<2>
-		inline __m128d halfRowAsDouble(size_t _n, bool _firstHalf) const;
-		static inline __m128d concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf);
-};
-
-inline matrix4SIMD operator*(float _scalar, const matrix4SIMD& _mtx)
-{
-    return _mtx * _scalar;
-}
-
-inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-    return matrix4SIMD::concatenateBFollowedByA(_a, _b);
-}
-/*
-inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-    return matrix4SIMD::concatenateBFollowedByAPrecisely(_a, _b);
-}
-*/
-
-
-}} // nbl::core
-
-#endif
diff --git a/include/matrix4SIMD_impl.h b/include/matrix4SIMD_impl.h
deleted file mode 100644
index 02484e7a4c..0000000000
--- a/include/matrix4SIMD_impl.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__
-#define __NBL_MATRIX4SIMD_IMPL_H_INCLUDED__
-
-#include "matrix4SIMD.h"
-#include "nbl/core/math/glslFunctions.tcc"
-#include "aabbox3d.h"
-
-namespace nbl
-{
-namespace core
-{
-
-
-inline bool matrix4SIMD::operator!=(const matrix4SIMD& _other) const
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		if ((rows[i] != _other.rows[i]).any())
-			return true;
-	return false;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator+=(const matrix4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] += _other.rows[i];
-	return *this;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator-=(const matrix4SIMD& _other)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] -= _other.rows[i];
-	return *this;
-}
-
-inline matrix4SIMD& matrix4SIMD::operator*=(float _scalar)
-{
-	for (size_t i = 0u; i < VectorCount; ++i)
-		rows[i] *= _scalar;
-	return *this;
-}
-
-inline bool matrix4SIMD::isIdentity(float _tolerance) const
-{
-	return core::equals<matrix4SIMD>(*this, matrix4SIMD(), core::ROUNDING_ERROR<matrix4SIMD>());
-}
-
-#ifdef __NBL_COMPILE_WITH_SSE3
-#define BROADCAST32(fpx) _MM_SHUFFLE(fpx, fpx, fpx, fpx)
-#define BUILD_MASKF(_x_, _y_, _z_, _w_) _mm_setr_epi32(_x_*0xffffffff, _y_*0xffffffff, _z_*0xffffffff, _w_*0xffffffff)
-inline matrix4SIMD matrix4SIMD::concatenateBFollowedByA(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-	auto calcRow = [](const __m128& _row, const matrix4SIMD& _mtx)
-	{
-		__m128 r0 = _mtx.rows[0].getAsRegister();
-		__m128 r1 = _mtx.rows[1].getAsRegister();
-		__m128 r2 = _mtx.rows[2].getAsRegister();
-		__m128 r3 = _mtx.rows[3].getAsRegister();
-
-		__m128 res;
-		res = _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(0)), r0);
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(1)), r1));
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(2)), r2));
-		res = _mm_add_ps(res, _mm_mul_ps(_mm_shuffle_ps(_row, _row, BROADCAST32(3)), r3));
-		return res;
-	};
-
-	matrix4SIMD r;
-	for (size_t i = 0u; i < 4u; ++i)
-		r.rows[i] = calcRow(_a.rows[i].getAsRegister(), _b);
-
-	return r;
-}
-inline matrix4SIMD matrix4SIMD::concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix4SIMD& _b)
-{
-	matrix4SIMD out;
-
-	__m128i mask0011 = BUILD_MASKF(0, 0, 1, 1);
-	__m128 second;
-
-	{
-	__m128d r00 = _a.halfRowAsDouble(0u, true);
-	__m128d r01 = _a.halfRowAsDouble(0u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r00, r01, _b, false));
-	out.rows[0] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r00, r01, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r10 = _a.halfRowAsDouble(1u, true);
-	__m128d r11 = _a.halfRowAsDouble(1u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r10, r11, _b, false));
-	out.rows[1] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r10, r11, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r20 = _a.halfRowAsDouble(2u, true);
-	__m128d r21 = _a.halfRowAsDouble(2u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r20, r21, _b, false));
-	out.rows[2] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r20, r21, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	{
-	__m128d r30 = _a.halfRowAsDouble(3u, true);
-	__m128d r31 = _a.halfRowAsDouble(3u, false);
-	second = _mm_cvtpd_ps(concat64_helper(r30, r31, _b, false));
-	out.rows[3] = vectorSIMDf(_mm_cvtpd_ps(concat64_helper(r30, r31, _b, true))) | _mm_castps_si128((vectorSIMDf(_mm_movelh_ps(second, second)) & mask0011).getAsRegister());
-	}
-
-	return out;
-}
-
-inline matrix4SIMD& matrix4SIMD::setScale(const core::vectorSIMDf& _scale)
-{
-	const __m128i mask0001 = BUILD_MASKF(0, 0, 0, 1);
-
-	rows[0] = (_scale & BUILD_MASKF(1, 0, 0, 0)) | _mm_castps_si128((rows[0] & mask0001).getAsRegister());
-	rows[1] = (_scale & BUILD_MASKF(0, 1, 0, 0)) | _mm_castps_si128((rows[1] & mask0001).getAsRegister());
-	rows[2] = (_scale & BUILD_MASKF(0, 0, 1, 0)) | _mm_castps_si128((rows[2] & mask0001).getAsRegister());
-	rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return *this;
-}
-
-//! Returns last column of the matrix.
-inline vectorSIMDf matrix4SIMD::getTranslation() const
-{
-	__m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), rows[3].getAsRegister()); // (2z,3z,2w,3w)
-	__m128 col3 = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,3w)
-
-	return col3;
-}
-//! Returns translation part of the matrix (w component is always 0).
-inline vectorSIMDf matrix4SIMD::getTranslation3D() const
-{
-	__m128 tmp1 = _mm_unpackhi_ps(rows[0].getAsRegister(), rows[1].getAsRegister()); // (0z,1z,0w,1w)
-	__m128 tmp2 = _mm_unpackhi_ps(rows[2].getAsRegister(), _mm_setzero_ps()); // (2z,0,2w,0)
-	__m128 transl = _mm_movehl_ps(tmp1, tmp2);// (0w,1w,2w,0)
-
-	return transl;
-}
-
-inline vectorSIMDf matrix4SIMD::sub3x3TransformVect(const vectorSIMDf& _in) const
-{
-	matrix4SIMD cp{*this};
-	vectorSIMDf out = _in & BUILD_MASKF(1, 1, 1, 0);
-	transformVect(out);
-	return out;
-}
-
-inline void matrix4SIMD::transformVect(vectorSIMDf& _out, const vectorSIMDf& _in) const
-{
-	vectorSIMDf r[4];
-	for (size_t i = 0u; i < VectorCount; ++i)
-		r[i] = rows[i] * _in;
-
-	_out = _mm_hadd_ps(
-		_mm_hadd_ps(r[0].getAsRegister(), r[1].getAsRegister()),
-		_mm_hadd_ps(r[2].getAsRegister(), r[3].getAsRegister())
-	);
-}
-
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
-{
-	const float h = core::reciprocal<float>(tanf(fieldOfViewRadians*0.5f));
-	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
-	const float w = h / aspectRatio;
-
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, -zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, -1.f, 0.f);
-
-	return m;
-}
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
-{
-	const float h = core::reciprocal<float>(tanf(fieldOfViewRadians*0.5f));
-	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
-	const float w = h / aspectRatio;
-
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(w, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -h, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, zFar/(zFar-zNear), -zNear*zFar/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 1.f, 0.f);
-
-	return m;
-}
-
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
-{
-	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, -1.f/(zFar-zNear), -zNear/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return m;
-}
-inline matrix4SIMD matrix4SIMD::buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
-{
-	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
-	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
-
-	matrix4SIMD m;
-	m.rows[0] = vectorSIMDf(2.f/widthOfViewVolume, 0.f, 0.f, 0.f);
-	m.rows[1] = vectorSIMDf(0.f, -2.f/heightOfViewVolume, 0.f, 0.f);
-	m.rows[2] = vectorSIMDf(0.f, 0.f, 1.f/(zFar-zNear), -zNear/(zFar-zNear));
-	m.rows[3] = vectorSIMDf(0.f, 0.f, 0.f, 1.f);
-
-	return m;
-}
-
-
-
-inline __m128d matrix4SIMD::halfRowAsDouble(size_t _n, bool _firstHalf) const
-{
-	return _mm_cvtps_pd(_firstHalf ? rows[_n].xyxx().getAsRegister() : rows[_n].zwxx().getAsRegister());
-}
-inline __m128d matrix4SIMD::concat64_helper(const __m128d& _a0, const __m128d& _a1, const matrix4SIMD& _mtx, bool _firstHalf)
-{
-	__m128d r0 = _mtx.halfRowAsDouble(0u, _firstHalf);
-	__m128d r1 = _mtx.halfRowAsDouble(1u, _firstHalf);
-	__m128d r2 = _mtx.halfRowAsDouble(2u, _firstHalf);
-	__m128d r3 = _mtx.halfRowAsDouble(3u, _firstHalf);
-
-	//const __m128d mask01 = _mm_castsi128_pd(_mm_setr_epi32(0, 0, 0xffffffff, 0xffffffff));
-
-	__m128d res;
-	res = _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 0), r0);
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a0, _a0, 3/*0b11*/), r1));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 0), r2));
-	res = _mm_add_pd(res, _mm_mul_pd(_mm_shuffle_pd(_a1, _a1, 3/*0b11*/), r3));
-	return res;
-}
-
-#undef BUILD_MASKF
-#undef BROADCAST32
-#else
-#error "no implementation"
-#endif
-
-inline bool matrix4SIMD::isBoxInFrustum(const aabbox3d<float>& bbox)
-{
-	vectorSIMDf MinEdge, MaxEdge;
-	MinEdge.set(bbox.MinEdge);
-	MaxEdge.set(bbox.MaxEdge);
-	MinEdge.w = 1.f;
-	MaxEdge.w = 1.f;
-
-
-	auto getClosestDP = [&MinEdge,&MaxEdge](const vectorSIMDf& toDot) -> float
-	{
-		return dot(mix(MaxEdge,MinEdge,toDot<vectorSIMDf(0.f)),toDot)[0];
-	};
-
-	// near plane
-	if (getClosestDP(rows[3])<=0.f)
-		return false;
-
-	// x max
-	if (getClosestDP(rows[3]+rows[0])<=0.f)
-		return false;
-	// y max
-	if (getClosestDP(rows[3]+rows[1])<=0.f)
-		return false;
-	// x min
-	if (getClosestDP(rows[3]-rows[0])<=0.f)
-		return false;
-	// y min
-	if (getClosestDP(rows[3]-rows[1])<=0.f)
-		return false;
-
-	// far plane
-	if (getClosestDP(rows[3]+rows[2])<=0.f)
-		return false;
-
-	return true;
-}
-
-}
-} // nbl::core
-
-#endif
diff --git a/include/nabla.h b/include/nabla.h
index dac3155e5e..c3981635ea 100644
--- a/include/nabla.h
+++ b/include/nabla.h
@@ -53,9 +53,8 @@
 #include "vector3d.h"
 #include "vectorSIMD.h"
 #include "line3d.h"
-#include "matrix4SIMD.h"
 #include "position2d.h"
-#include "quaternion.h"
+#include "nbl/builtin/hlsl/math/quaternion/quaternion.hlsl"
 #include "rect.h"
 #include "dimension2d.h"
 
diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h
index 94e53238b2..889a5ce626 100644
--- a/include/nbl/asset/IAccelerationStructure.h
+++ b/include/nbl/asset/IAccelerationStructure.h
@@ -10,6 +10,7 @@
 #include <compare>
 
 #include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl"
+#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl"
 
 #include "nbl/asset/ECommonEnums.h"
 #include "nbl/asset/IDescriptor.h"
@@ -181,7 +182,7 @@ class ITopLevelAccelerationStructure : public AccelerationStructure
 			FORCE_OPACITY_MICROMAP_2_STATE_BIT = 0x1u<<4u,
 			FORCE_DISABLE_OPACITY_MICROMAPS_BIT = 0x1u<<5u,
 		};
-		// Note: `core::matrix3x4SIMD` is equvalent to VkTransformMatrixKHR, 4x3 row_major matrix
+		// Note: `hlsl::float32_t3x4` is equivalent to VkTransformMatrixKHR, a 3x4 row-major matrix
 		template<typename blas_ref_t>
 		struct Instance final
 		{
@@ -197,18 +198,18 @@ class ITopLevelAccelerationStructure : public AccelerationStructure
 		template<typename blas_ref_t>
 		struct StaticInstance final
 		{
-			core::matrix3x4SIMD	transform = core::matrix3x4SIMD();
+			hlsl::float32_t3x4 transform = hlsl::diagonal<hlsl::float32_t3x4>(1.0f);
 			Instance<blas_ref_t> base = {};
 		};
 		template<typename blas_ref_t>
 		struct MatrixMotionInstance final
 		{
-			core::matrix3x4SIMD transform[2] = {core::matrix3x4SIMD(),core::matrix3x4SIMD()};
+			hlsl::float32_t3x4 transform[2] = { hlsl::diagonal<hlsl::float32_t3x4>(1.0f), hlsl::diagonal<hlsl::float32_t3x4>(1.0f) };
 			Instance<blas_ref_t> base = {};
 		};
 		struct SRT
 		{
-			// TODO: some operators to convert back and forth from `core::matrix3x4SIMD
+			// TODO: some operators to convert back and forth from `hlsl::float32_t3x4`
 
 			float    sx;
 			float    a;
diff --git a/include/nbl/asset/IAnimationLibrary.h b/include/nbl/asset/IAnimationLibrary.h
index 9665349103..d650cb25d9 100644
--- a/include/nbl/asset/IAnimationLibrary.h
+++ b/include/nbl/asset/IAnimationLibrary.h
@@ -34,7 +34,7 @@ class IAnimationLibrary : public virtual core::IReferenceCounted
 					translation[2] = translation[1] = translation[0] = 0.f;
 					quat = core::vectorSIMDu32(128u,128u,128u,255u); // should be (0,0,0,1) encoded
 				}
-				Keyframe(const core::vectorSIMDf& _scale, const core::quaternion& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation)
+				Keyframe(const core::vectorSIMDf& _scale, const hlsl::quaternion<float>& _quat, const CQuantQuaternionCache* quantCache, const core::vectorSIMDf& _translation)
 				{
 					std::copy(_translation.pointer,_translation.pointer+3,translation);
 					quat = quantCache->template quantize<decltype(quat)>(_quat);
@@ -42,13 +42,13 @@ class IAnimationLibrary : public virtual core::IReferenceCounted
 					//scale = ;
 				}
 
-				inline core::quaternion getRotation() const
+				inline hlsl::quaternion<float> getRotation() const
 				{
 					const void* _pix[4] = {&quat,nullptr,nullptr,nullptr};
 					double out[4];
 					decodePixels<EF_R8G8B8A8_SNORM,double>(_pix,out,0u,0u);
 					auto q = core::normalize(core::vectorSIMDf(out[0],out[1],out[2],out[3]));
-					return reinterpret_cast<const core::quaternion*>(&q)[0];
+					return reinterpret_cast<const hlsl::quaternion<float>*>(&q)[0];
 				}
 
 				inline core::vectorSIMDf getScale() const
diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h
index 532b622090..c6fce408ab 100644
--- a/include/nbl/asset/ICPUMeshBuffer.h
+++ b/include/nbl/asset/ICPUMeshBuffer.h
@@ -582,18 +582,18 @@ class ICPUMeshBuffer final : public IMeshBuffer<ICPUBuffer,ICPUDescriptorSet,ICP
         }
 
         //!
-        inline const core::matrix3x4SIMD* getInverseBindPoses() const
+        inline const hlsl::float32_t3x4* getInverseBindPoses() const
         {
             if (!m_inverseBindPoseBufferBinding.buffer)
                 return nullptr;
 
             const uint8_t* ptr = reinterpret_cast<const uint8_t*>(m_inverseBindPoseBufferBinding.buffer->getPointer());
-            return reinterpret_cast<const core::matrix3x4SIMD*>(ptr+m_inverseBindPoseBufferBinding.offset);
+            return reinterpret_cast<const hlsl::float32_t3x4*>(ptr+m_inverseBindPoseBufferBinding.offset);
         }
-        inline core::matrix3x4SIMD* getInverseBindPoses()
+        inline hlsl::float32_t3x4* getInverseBindPoses()
         {
             assert(isMutable());
-            return const_cast<core::matrix3x4SIMD*>(const_cast<const ICPUMeshBuffer*>(this)->getInverseBindPoses());
+            return const_cast<hlsl::float32_t3x4*>(const_cast<const ICPUMeshBuffer*>(this)->getInverseBindPoses());
         }
 
         //!
diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h
index 6f1c576ed8..53d7e66be0 100644
--- a/include/nbl/asset/ICPUSkeleton.h
+++ b/include/nbl/asset/ICPUSkeleton.h
@@ -42,15 +42,15 @@ class ICPUSkeleton final : public ISkeleton<ICPUBuffer>, public IAsset
 		}
 
 		//!
-		inline const core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID) const
+		inline const hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID) const
 		{
 			const uint8_t* ptr = reinterpret_cast<const uint8_t*>(m_defaultTransforms.buffer->getPointer());
-			return reinterpret_cast<const core::matrix3x4SIMD*>(ptr+m_defaultTransforms.offset)[jointID];
+			return reinterpret_cast<const hlsl::float32_t3x4*>(ptr+m_defaultTransforms.offset)[jointID];
 		}
-		inline core::matrix3x4SIMD& getDefaultTransformMatrix(base_t::joint_id_t jointID)
+		inline hlsl::float32_t3x4& getDefaultTransformMatrix(base_t::joint_id_t jointID)
 		{
 			assert(isMutable());
-			return const_cast<core::matrix3x4SIMD&>(const_cast<const ICPUSkeleton*>(this)->getDefaultTransformMatrix(jointID));
+			return const_cast<hlsl::float32_t3x4&>(const_cast<const ICPUSkeleton*>(this)->getDefaultTransformMatrix(jointID));
 		}
 
 		//!
diff --git a/include/nbl/asset/IMeshBuffer.h b/include/nbl/asset/IMeshBuffer.h
index a4f1b895dc..c68fe3408f 100644
--- a/include/nbl/asset/IMeshBuffer.h
+++ b/include/nbl/asset/IMeshBuffer.h
@@ -210,7 +210,7 @@ class IMeshBuffer : public virtual core::IReferenceCounted
         virtual inline bool isSkinned() const
         {
             return  jointCount>0u && maxJointsPerVx>0u && m_inverseBindPoseBufferBinding.buffer &&
-                    m_inverseBindPoseBufferBinding.offset+jointCount*sizeof(core::matrix3x4SIMD)<=m_inverseBindPoseBufferBinding.buffer->getSize();
+                    m_inverseBindPoseBufferBinding.offset+jointCount*sizeof(hlsl::float32_t3x4)<=m_inverseBindPoseBufferBinding.buffer->getSize();
         }
 
         //!
@@ -227,7 +227,7 @@ class IMeshBuffer : public virtual core::IReferenceCounted
             if (_maxJointsPerVx==0u || _maxJointsPerVx>4u)
                 return false;
 
-            if (_inverseBindPoseBufferBinding.offset+_jointCount*sizeof(core::matrix3x4SIMD)>_inverseBindPoseBufferBinding.buffer->getSize())
+            if (_inverseBindPoseBufferBinding.offset+_jointCount*sizeof(hlsl::float32_t3x4)>_inverseBindPoseBufferBinding.buffer->getSize())
                 return false;
 
             m_inverseBindPoseBufferBinding = std::move(_inverseBindPoseBufferBinding);
diff --git a/include/nbl/asset/ISkeleton.h b/include/nbl/asset/ISkeleton.h
index 7960ca4eef..03ba3af4ea 100644
--- a/include/nbl/asset/ISkeleton.h
+++ b/include/nbl/asset/ISkeleton.h
@@ -62,7 +62,7 @@ class ISkeleton : public virtual core::IReferenceCounted
 				return;
 
 			assert(m_parentJointIDs.buffer->getSize()>=m_parentJointIDs.offset+sizeof(joint_id_t)*m_jointCount);
-			assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(core::matrix3x4SIMD)*m_jointCount);
+			assert(m_defaultTransforms.buffer->getSize()>=m_defaultTransforms.offset+sizeof(hlsl::float32_t3x4)*m_jointCount);
 		}
 		virtual ~ISkeleton()
 		{
diff --git a/include/nbl/asset/asset_utils.h b/include/nbl/asset/asset_utils.h
index 8e4e35a733..84c8a8df45 100644
--- a/include/nbl/asset/asset_utils.h
+++ b/include/nbl/asset/asset_utils.h
@@ -31,7 +31,7 @@ inline void fillBufferWithDeadBeef(ICPUBuffer* _buf)
 
 #include "nbl/nblpack.h"
 //! Designed for use with interface blocks declared with `layout (row_major, std140)`
-// TODO: change members to core::matrix3x4SIMD and core::matrix4SIMD
+// TODO: change members to hlsl::float32_t3x4 and hlsl::float32_t4x4
 struct SBasicViewParameters
 {
     float MVP[4*4];
diff --git a/include/nbl/asset/metadata/IMeshMetadata.h b/include/nbl/asset/metadata/IMeshMetadata.h
index 5ce3c12980..25f72e05c0 100644
--- a/include/nbl/asset/metadata/IMeshMetadata.h
+++ b/include/nbl/asset/metadata/IMeshMetadata.h
@@ -18,7 +18,7 @@ class IMeshMetadata : public core::Interface
 	public:
 		struct SInstance
 		{
-			core::matrix3x4SIMD worldTform;
+			hlsl::float32_t3x4 worldTform;
 		};
 		core::SRange<const SInstance> m_instances;
 
diff --git a/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h b/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h
index 416c04823b..7d0b63a141 100644
--- a/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h
+++ b/include/nbl/asset/metadata/IRenderpassIndependentPipelineMetadata.h
@@ -140,35 +140,35 @@ class IRenderpassIndependentPipelineMetadata : public core::Interface
 		//! A non exhaustive list of commonly used shader input semantics
 		enum E_COMMON_SHADER_INPUT
 		{
-			//! core::matrix4SIMD giving the total projection onto the screen from model-space coordinates
+			//! hlsl::float32_t4x4 giving the total projection onto the screen from model-space coordinates
 			ECSI_WORLD_VIEW_PROJ,
-			//! core::matrix4SIMD giving the mapping from view-space into the pre-divide NDC space
+			//! hlsl::float32_t4x4 giving the mapping from view-space into the pre-divide NDC space
 			ECSI_PROJ,
-			//! core::matrix3x4SIMD giving the view-space transformation from model-space coordinates
+			//! hlsl::float32_t3x4 giving the view-space transformation from model-space coordinates
 			ECSI_WORLD_VIEW,
-			//! core::matrix3x4SIMD giving the view-space transformation from world-space
+			//! hlsl::float32_t3x4 giving the view-space transformation from world-space
 			ECSI_VIEW,
-			//! core::matrix3x4SIMD giving the world-space transformation from model-space (last column is object world-space-position)
+			//! hlsl::float32_t3x4 giving the world-space transformation from model-space (last column is object world-space-position)
 			ECSI_WORLD,
-			//! core::matrix4SIMD giving the total projection to model-space coordinates from screen-space
+			//! hlsl::float32_t4x4 giving the total projection to model-space coordinates from screen-space
 			ECSI_WORLD_VIEW_PROJ_INVERSE,
-			//! core::matrix4SIMD giving the mapping from the pre-divide NDC space into view-space
+			//! hlsl::float32_t4x4 giving the mapping from the pre-divide NDC space into view-space
 			ECSI_PROJ_INVERSE,
-			//! core::matrix3x4SIMD giving the model-space transformation from view-space coordinates
+			//! hlsl::float32_t3x4 giving the model-space transformation from view-space coordinates
 			ECSI_WORLD_VIEW_INVERSE,
-			//! core::matrix3x4SIMD giving the world-space transformation from view-space (last column is camera world-space-position)
+			//! hlsl::float32_t3x4 giving the world-space transformation from view-space (last column is camera world-space-position)
 			ECSI_VIEW_INVERSE,
-			//! core::matrix3x4SIMD giving the model-space transformation from world-space
+			//! hlsl::float32_t3x4 giving the model-space transformation from world-space
 			ECSI_WORLD_INVERSE,
-			//! transpose of core::matrix4SIMD giving the total projection to model-space coordinates from screen-space
+			//! transpose of hlsl::float32_t4x4 giving the total projection to model-space coordinates from screen-space
 			ECSI_WORLD_VIEW_PROJ_INVERSE_TRANSPOSE,
-			//! transpose of core::matrix4SIMD giving the mapping from the pre-divide NDC space into view-space
+			//! transpose of hlsl::float32_t4x4 giving the mapping from the pre-divide NDC space into view-space
 			ECSI_PROJ_INVERSE_TRANSPOSE,
-			//! transpose of core::matrix3x4SIMD giving the model-space transformation from view-space coordinates (upper 3x3 matrix can be used instead of `gl_NormalMatrix`)
+			//! transpose of hlsl::float32_t3x4 giving the model-space transformation from view-space coordinates (upper 3x3 matrix can be used instead of `gl_NormalMatrix`)
 			ECSI_WORLD_VIEW_INVERSE_TRANSPOSE,
-			//! transpose of core::matrix3x4SIMD giving the world-space transformation from view-space (last row is camera world-space-position)
+			//! transpose of hlsl::float32_t3x4 giving the world-space transformation from view-space (last row is camera world-space-position)
 			ECSI_VIEW_INVERSE_TRANSPOSE,
-			//! transpose of core::matrix3x4SIMD giving the model-space transformation from world-space (upper 3x3 matrix can transform model space normals to world space)
+			//! transpose of hlsl::float32_t3x4 giving the model-space transformation from world-space (upper 3x3 matrix can transform model space normals to world space)
 			ECSI_WORLD_INVERSE_TRANSPOSE,
 
 			//! a simple non-filtered environment map as a cubemap
diff --git a/include/nbl/asset/utils/CQuantQuaternionCache.h b/include/nbl/asset/utils/CQuantQuaternionCache.h
index 8e46dffb0a..a51549d24d 100644
--- a/include/nbl/asset/utils/CQuantQuaternionCache.h
+++ b/include/nbl/asset/utils/CQuantQuaternionCache.h
@@ -60,7 +60,7 @@ class CQuantQuaternionCache : public CDirQuantCacheBase<impl::Projection,impl::Q
 
 	public:
 		template<E_FORMAT CacheFormat>
-		value_type_t<CacheFormat> quantize(const core::quaternion& quat)
+		value_type_t<CacheFormat> quantize(const hlsl::quaternion<float>& quat)
 		{
 			return Base::quantize<4u,CacheFormat>(reinterpret_cast<const core::vectorSIMDf&>(quat));
 		}
diff --git a/include/nbl/asset/utils/IMeshManipulator.h b/include/nbl/asset/utils/IMeshManipulator.h
index f84d85c75d..6aff59200c 100644
--- a/include/nbl/asset/utils/IMeshManipulator.h
+++ b/include/nbl/asset/utils/IMeshManipulator.h
@@ -18,6 +18,9 @@
 #include "nbl/asset/utils/CQuantNormalCache.h"
 #include "nbl/asset/utils/CQuantQuaternionCache.h"
 
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+#include <nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl>
+
 namespace nbl
 {
 namespace asset
@@ -351,7 +354,7 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted
 
 		static float DistanceToLine(core::vectorSIMDf P0, core::vectorSIMDf P1, core::vectorSIMDf InPoint);
 		static float DistanceToPlane(core::vectorSIMDf InPoint, core::vectorSIMDf PlanePoint, core::vectorSIMDf PlaneNormal);
-		static core::matrix3x4SIMD calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer);
+		static hlsl::float32_t3x4 calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer);
 
 		//! Calculates bounding box of the meshbuffer
 		static inline core::aabbox3df calculateBoundingBox(
@@ -408,8 +411,8 @@ class NBL_API2 IMeshManipulator : public virtual core::IReferenceCounted
 							if (jointID<jointCount)
 							if ((i<maxWeights ? weights[i]:weightRemainder)>FLT_MIN)
 							{
-								core::vectorSIMDf boneSpacePos;
-								inverseBindPoses[jointID].transformVect(boneSpacePos,pos);
+								const hlsl::float32_t4x4 transformationMatrix = hlsl::getMatrix3x4As4x4<hlsl::float32_t>(inverseBindPoses[jointID]);
+								core::vectorSIMDf boneSpacePos = hlsl::transformVector<hlsl::float32_t>(transformationMatrix, pos);
 								jointAABBs[jointID].addInternalPoint(boneSpacePos.getAsVector3df());
 								noJointInfluence = false;
 							}
diff --git a/include/nbl/builtin/glsl/math/quaternions.glsl b/include/nbl/builtin/glsl/math/quaternions.glsl
index 7dc6ca0279..d94d48ecc9 100644
--- a/include/nbl/builtin/glsl/math/quaternions.glsl
+++ b/include/nbl/builtin/glsl/math/quaternions.glsl
@@ -1,18 +1,13 @@
 #ifndef _NBL_BUILTIN_GLSL_MATH_QUATERNIONS_INCLUDED_
 #define _NBL_BUILTIN_GLSL_MATH_QUATERNIONS_INCLUDED_
 
-
-
 #include <nbl/builtin/glsl/math/functions.glsl>
 
-
-
 struct nbl_glsl_quaternion_t
 {
     vec4 data;
 };
 
-
 nbl_glsl_quaternion_t nbl_glsl_quaternion_t_constructFromTruncated(in vec3 first3Components)
 {
     nbl_glsl_quaternion_t quat;
diff --git a/include/nbl/builtin/hlsl/bitreverse.hlsl b/include/nbl/builtin/hlsl/bitreverse.hlsl
deleted file mode 100644
index cea9268f45..0000000000
--- a/include/nbl/builtin/hlsl/bitreverse.hlsl
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef _NBL_BUILTIN_HLSL_BITREVERSE_INCLUDED_
-#define _NBL_BUILTIN_HLSL_BITREVERSE_INCLUDED_
-
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-
-namespace nbl
-{
-namespace hlsl
-{
-
-template<typename T, uint16_t Bits NBL_FUNC_REQUIRES(is_unsigned_v<T>&& Bits <= sizeof(T) * 8)
-/**
-* @brief Takes the binary representation of `value` as a string of `Bits` bits and returns a value of the same type resulting from reversing the string
-*
-* @tparam T Type of the value to operate on.
-* @tparam Bits The length of the string of bits used to represent `value`.
-*
-* @param [in] value The value to bitreverse.
-*/
-T bitReverseAs(T value)
-{
-	return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - Bits));
-}
-
-template<typename T NBL_FUNC_REQUIRES(is_unsigned_v<T>)
-/**
-* @brief Takes the binary representation of `value` and returns a value of the same type resulting from reversing the string of bits as if it was `bits` long.
-* Keep in mind `bits` cannot exceed `8 * sizeof(T)`.
-*
-* @tparam T type of the value to operate on.
-*
-* @param [in] value The value to bitreverse.
-* @param [in] bits The length of the string of bits used to represent `value`.
-*/
-T bitReverseAs(T value, uint16_t bits)
-{
-	return bitReverse<T>(value) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - bits));
-}
-
-
-}
-}
-
-
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/camera/view_matrix.hlsl b/include/nbl/builtin/hlsl/camera/view_matrix.hlsl
new file mode 100644
index 0000000000..27b2c63239
--- /dev/null
+++ b/include/nbl/builtin/hlsl/camera/view_matrix.hlsl
@@ -0,0 +1,68 @@
+#ifndef _NBL_BUILTIN_HLSL_CAMERA_VIEW_MATRIX_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CAMERA_VIEW_MATRIX_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+// NOTE(review): parameters are taken as C++ references; if this header must also compile as HLSL they presumably need NBL_CONST_REF_ARG - TODO confirm
+// NOTE(review): neither builder guards against target == position or an upVector parallel to the view direction; hlsl::normalize then receives a zero-length vector
+
+//! Builds a left-handed look-at view matrix for a camera placed at `position` and facing `target`.
+/** The z axis is normalize(target - position), x is normalize(cross(upVector, z)) and y is cross(z, x).
+	Row i of the result holds axis i in its first three components and -dot(axis i, position) in the fourth.
+	\param position Position of the camera.
+	\param target Point the camera looks at.
+	\param upVector Up direction used to derive the x axis.
+	\return The 3x4 view matrix.
+	*/
+template<typename T>
+inline matrix<T, 3, 4> buildCameraLookAtMatrixLH(
+	const vector<T, 3>& position,
+	const vector<T, 3>& target,
+	const vector<T, 3>& upVector)
+{
+	const vector<T, 3> zaxis = hlsl::normalize(target - position);
+	const vector<T, 3> xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis));
+	const vector<T, 3> yaxis = hlsl::cross(zaxis, xaxis);
+
+	matrix<T, 3, 4> r;
+	r[0] = vector<T, 4>(xaxis, -hlsl::dot(xaxis, position));
+	r[1] = vector<T, 4>(yaxis, -hlsl::dot(yaxis, position));
+	r[2] = vector<T, 4>(zaxis, -hlsl::dot(zaxis, position));
+
+	return r;
+}
+
+//! Builds a right-handed look-at view matrix for a camera placed at `position` and facing `target`.
+/** Identical to buildCameraLookAtMatrixLH except that the z axis is normalize(position - target).
+	\param position Position of the camera.
+	\param target Point the camera looks at.
+	\param upVector Up direction used to derive the x axis.
+	\return The 3x4 view matrix.
+	*/
+template<typename T>
+inline matrix<T, 3, 4> buildCameraLookAtMatrixRH(
+	const vector<T, 3>& position,
+	const vector<T, 3>& target,
+	const vector<T, 3>& upVector)
+{
+	const vector<T, 3> zaxis = hlsl::normalize(position - target);
+	const vector<T, 3> xaxis = hlsl::normalize(hlsl::cross(upVector, zaxis));
+	const vector<T, 3> yaxis = hlsl::cross(zaxis, xaxis);
+
+	matrix<T, 3, 4> r;
+	r[0] = vector<T, 4>(xaxis, -hlsl::dot(xaxis, position));
+	r[1] = vector<T, 4>(yaxis, -hlsl::dot(yaxis, position));
+	r[2] = vector<T, 4>(zaxis, -hlsl::dot(zaxis, position));
+
+	return r;
+}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index 29660b8b45..34c4c2c542 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -4,7 +4,6 @@
 #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
 #define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
 
-
 #include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 #include <nbl/builtin/hlsl/utility.hlsl>
@@ -68,7 +67,7 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP
 //
 #define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__;
 #define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__;
-#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C<decltype E,__VA_ARGS__ >;
+#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C<decltype E __VA_OPT__(,) __VA_ARGS__ >;
 //
 #define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
 //
@@ -77,56 +76,8 @@ concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP
 #define NBL_CONCEPT_END(SEQ) BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_DEF, DUMMY, SEQ) \
 }
 
-
-#include <concepts>
-
-// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
-// the macros here.
-template <typename T, typename U>
-concept same_as = std::same_as<T, U>;
-
-template <typename D, typename B>
-concept derived_from = std::derived_from<D, B>;
-
-template <typename F, typename T>
-concept convertible_to = std::convertible_to<F, T>;
-
-template <typename T, typename F>
-concept assignable_from = std::assignable_from<T, F>;
-
-template <typename T, typename U>
-concept common_with = std::common_with<T, U>;
-
-template <typename T>
-concept integral = std::integral<T>;
-
-template <typename T>
-concept signed_integral = std::signed_integral<T>;
-
-template <typename T>
-concept unsigned_integral = std::unsigned_integral<T>;
-
-template <typename T>
-concept floating_point = std::floating_point<T>;
-
-
-// Some other useful concepts.
-
-template<typename T, typename... Ts>
-concept any_of = (same_as<T, Ts> || ...);
-
-template <typename T>
-concept scalar = floating_point<T> || integral<T>;
-
-template <typename T>
-concept vectorial = is_vector<T>::value;
-
-template <typename T>
-concept matricial = is_matrix<T>::value;
-
 #else
 
-
 // to define a concept using `concept Name = SomeContexprBoolCondition<T>;`
 #define NBL_BOOL_CONCEPT NBL_CONSTEXPR bool
 
@@ -144,7 +95,6 @@ concept matricial = is_matrix<T>::value;
 // condition, use instead of the closing `>` of a function template
 #define NBL_FUNC_REQUIRES(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),bool> = true>
 
-
 //
 #define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) namespace BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME) \
 {
@@ -153,7 +103,7 @@ concept matricial = is_matrix<T>::value;
 //
 #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t<typename __VA_ARGS__ >
 #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t<decltype(__VA_ARGS__)>
-#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t<C<decltype E ,__VA_ARGS__  > >
+#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t<C<decltype E __VA_OPT__(,) __VA_ARGS__  > >
 //
 #define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
 //
diff --git a/include/nbl/builtin/hlsl/concepts/core.hlsl b/include/nbl/builtin/hlsl/concepts/core.hlsl
new file mode 100644
index 0000000000..4e20c645c8
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/core.hlsl
@@ -0,0 +1,83 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_CORE_HLSL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_CORE_HLSL_INCLUDED_
+
+
+#include <nbl/builtin/hlsl/concepts.hlsl>
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+
+template<typename T, typename U>
+NBL_BOOL_CONCEPT same_as = is_same_v<T, U>;
+
+template<typename T>
+NBL_BOOL_CONCEPT Integral = nbl::hlsl::is_integral_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT SignedIntegral = nbl::hlsl::is_signed_v<T> && nbl::hlsl::is_integral_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT UnsignedIntegral = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT Boolean = nbl::hlsl::is_same_v<T, bool> || (nbl::hlsl::is_vector_v<T> && nbl::hlsl::is_same_v<typename vector_traits<T>::scalar_type, bool>);
+
+template <typename T>
+NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT IntegralScalar = nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT SignedIntegralScalar = nbl::hlsl::is_signed_v<T> && nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT UnsignedIntegralScalar = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
+
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointScalar = nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>;
+
+// TODO: implement when hlsl::is_base_of is done
+//#define NBL_CONCEPT_NAME DerivedFrom
+// ...
+
+// TODO: implement when hlsl::is_convertible is done
+//#define NBL_CONCEPT_NAME ConvertibleTo
+// ...
+
+// TODO?
+//#define NBL_CONCEPT_NAME AssignableFrom
+
+// TODO?
+//template <typename T, typename U>
+//concept common_with = std::common_with<T, U>;
+
+namespace impl
+{
+template<typename T>
+struct is_emulating_floating_point_scalar
+{
+	NBL_CONSTEXPR_STATIC_INLINE bool value = FloatingPointScalar<T>;
+};
+}
+
+//! Floating point types are native floating point types or types that imitate native floating point types (for example emulated_float64_t)
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointLikeScalar = impl::is_emulating_floating_point_scalar<T>::value;
+
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts/matrix.hlsl b/include/nbl/builtin/hlsl/concepts/matrix.hlsl
new file mode 100644
index 0000000000..94659c823b
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/matrix.hlsl
@@ -0,0 +1,27 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_MATRIX_HLSL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_MATRIX_HLSL_INCLUDED_
+
+
+#include <nbl/builtin/hlsl/concepts.hlsl>
+#include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+
+template<typename T>
+NBL_BOOL_CONCEPT Matrix = is_matrix<T>::value;
+
+template<typename T>
+NBL_BOOL_CONCEPT Matricial = matrix_traits<T>::IsMatrix;
+
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/concepts/vector.hlsl b/include/nbl/builtin/hlsl/concepts/vector.hlsl
new file mode 100644
index 0000000000..edea37a183
--- /dev/null
+++ b/include/nbl/builtin/hlsl/concepts/vector.hlsl
@@ -0,0 +1,48 @@
+// Copyright (C) 2024-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_VECTOR_HLSL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CONCEPTS_VECTOR_HLSL_INCLUDED_
+
+
+#include <nbl/builtin/hlsl/concepts.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+
+//! Concept for native vectors.
+template<typename T>
+NBL_BOOL_CONCEPT Vector = is_vector<T>::value;
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointVector = concepts::Vector<T> && concepts::FloatingPointScalar<typename vector_traits<T>::scalar_type>;
+template<typename T>
+NBL_BOOL_CONCEPT IntVector = concepts::Vector<T> && (is_integral_v<typename vector_traits<T>::scalar_type>);
+template<typename T>
+NBL_BOOL_CONCEPT SignedIntVector = concepts::Vector<T> && concepts::SignedIntegralScalar<typename vector_traits<T>::scalar_type>;
+
+//! Concept for native vectors and vector like structs.
+template<typename T>
+NBL_BOOL_CONCEPT Vectorial = vector_traits<T>::IsVector;
+
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointVectorial = concepts::Vectorial<T> && concepts::FloatingPointScalar<typename vector_traits<T>::scalar_type>;
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointLikeVectorial = concepts::Vectorial<T> && concepts::FloatingPointLikeScalar<typename vector_traits<T>::scalar_type>;
+template<typename T>
+NBL_BOOL_CONCEPT IntVectorial = concepts::Vectorial<T> && (is_integral_v<typename vector_traits<T>::scalar_type>);
+template<typename T>
+NBL_BOOL_CONCEPT SignedIntVectorial = concepts::Vectorial<T> && concepts::SignedIntegralScalar<typename vector_traits<T>::scalar_type>;
+
+}
+}
+}
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
index 7520acbd19..c9ebf7fcf2 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl
@@ -6,6 +6,15 @@
 #include <nbl/builtin/hlsl/concepts.hlsl>
 #include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
 #include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/vector.hlsl>
+#include <nbl/builtin/hlsl/concepts/matrix.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/promote.hlsl>
+#include <nbl/builtin/hlsl/numbers.hlsl>
+#include <boost/preprocessor/comparison/not_equal.hpp>
+#include <boost/preprocessor/punctuation/comma_if.hpp>
+#include <boost/preprocessor/seq/for_each_i.hpp>
 
 namespace nbl
 {
@@ -13,418 +22,746 @@ namespace hlsl
 {
 namespace cpp_compat_intrinsics_impl
 {
-template<typename T>
-struct dot_helper
-{
-	using scalar_type = typename vector_traits<T>::scalar_type;
 
-	static inline scalar_type dot_product(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs)
-	{
-		static array_get<T, scalar_type> getter;
-		scalar_type retval = getter(lhs, 0) * getter(rhs, 0);
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct dot_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct cross_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct clamp_helper;
+template<typename Integer NBL_STRUCT_CONSTRAINABLE>
+struct find_msb_helper;
+template<typename Integer NBL_STRUCT_CONSTRAINABLE>
+struct find_lsb_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct bitReverse_helper;
+template<typename Matrix NBL_STRUCT_CONSTRAINABLE>
+struct transpose_helper;
+template<typename Vector NBL_STRUCT_CONSTRAINABLE>
+struct length_helper;
+template<typename Vector NBL_STRUCT_CONSTRAINABLE>
+struct normalize_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct max_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct min_helper;
+template<typename Integer NBL_STRUCT_CONSTRAINABLE>
+struct bitCount_helper;
+template<typename LhsT, typename RhsT NBL_STRUCT_CONSTRAINABLE>
+struct mul_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct determinant_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct inverse_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct rsqrt_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct all_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct any_helper;
+template<typename T, uint16_t Bits NBL_STRUCT_CONSTRAINABLE>
+struct bitReverseAs_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct frac_helper;
+template<typename T, typename U NBL_STRUCT_CONSTRAINABLE>
+struct mix_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct sign_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct radians_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct degrees_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct step_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct smoothStep_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct faceForward_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct reflect_helper;
+template<typename T, typename U NBL_STRUCT_CONSTRAINABLE>
+struct refract_helper;
 
-		static const uint32_t ArrayDim = vector_traits<T>::Dimension;
-		for (uint32_t i = 1; i < ArrayDim; ++i)
-			retval = retval + getter(lhs, i) * getter(rhs, i);
+#ifdef __HLSL_VERSION // HLSL only specializations
 
-		return retval;
-	}
-};
+// it is crucial these partial specializations appear first because that's what makes the helpers match SPIR-V intrinsics first
 
-#define DEFINE_BUILTIN_VECTOR_SPECIALIZATION(FLOAT_TYPE, RETURN_VALUE)\
-template<uint32_t N>\
-struct dot_helper<vector<FLOAT_TYPE, N> >\
+#define DECLVAL(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) experimental::declval<_T>()
+#define DECL_ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) const _T arg##i
+#define WRAP(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) _T
+#define ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) arg##i
+
+// the template<> needs to be written ourselves
+// return type is __VA_ARGS__ to protect against `,` in templated return types
+#define AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(HELPER_NAME, SPIRV_FUNCTION_NAME, ARG_TYPE_LIST, ARG_TYPE_SET, ...)\
+NBL_PARTIAL_REQ_TOP(is_same_v<decltype(spirv::SPIRV_FUNCTION_NAME<T>(BOOST_PP_SEQ_FOR_EACH_I(DECLVAL, _, ARG_TYPE_SET))), __VA_ARGS__ >) \
+struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST) NBL_PARTIAL_REQ_BOT(is_same_v<decltype(spirv::SPIRV_FUNCTION_NAME<T>(BOOST_PP_SEQ_FOR_EACH_I(DECLVAL, _, ARG_TYPE_SET))), __VA_ARGS__ >) >\
 {\
-	using VectorType = vector<FLOAT_TYPE, N>;\
-	using ScalarType = typename vector_traits<VectorType>::scalar_type;\
-\
-	static inline ScalarType dot_product(NBL_CONST_REF_ARG(VectorType) lhs, NBL_CONST_REF_ARG(VectorType) rhs)\
+	using return_t = __VA_ARGS__;\
+	static inline return_t __call( BOOST_PP_SEQ_FOR_EACH_I(DECL_ARG, _, ARG_TYPE_SET) )\
 	{\
-		return RETURN_VALUE;\
+		return spirv::SPIRV_FUNCTION_NAME<T>( BOOST_PP_SEQ_FOR_EACH_I(ARG, _, ARG_TYPE_SET) );\
 	}\
-};\
+};
 
-#ifdef __HLSL_VERSION
-#define BUILTIN_VECTOR_SPECIALIZATION_RET_VAL dot(lhs, rhs)
-#else
-#define BUILTIN_VECTOR_SPECIALIZATION_RET_VAL glm::dot(lhs, rhs)
-#endif
+#define FIND_MSB_LSB_RETURN_TYPE conditional_t<is_vector_v<T>, vector<int32_t, vector_traits<T>::Dimension>, int32_t>
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(find_msb_helper, findUMsb, (T), (T), FIND_MSB_LSB_RETURN_TYPE)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(find_msb_helper, findSMsb, (T), (T), FIND_MSB_LSB_RETURN_TYPE)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(find_lsb_helper, findILsb, (T), (T), FIND_MSB_LSB_RETURN_TYPE)
+#undef FIND_MSB_LSB_RETURN_TYPE
+
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(bitReverse_helper, bitReverse, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(transpose_helper, transpose, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(length_helper, length, (T), (T), typename vector_traits<T>::scalar_type)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(normalize_helper, normalize, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(rsqrt_helper, inverseSqrt, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(frac_helper, fract, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(all_helper, all, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(any_helper, any, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sign_helper, fSign, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sign_helper, sSign, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(radians_helper, radians, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(degrees_helper, degrees, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(max_helper, fMax, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(max_helper, uMax, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(max_helper, sMax, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(min_helper, fMin, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(min_helper, uMin, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(min_helper, sMin, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(step_helper, step, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(reflect_helper, reflect, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(clamp_helper, fClamp, (T), (T)(T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(clamp_helper, uClamp, (T), (T)(T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(clamp_helper, sClamp, (T), (T)(T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(smoothStep_helper, smoothStep, (T), (T)(T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(faceForward_helper, faceForward, (T), (T)(T)(T), T)
+template<typename T, typename U> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(refract_helper, refract, (T)(U), (T)(T)(U), T)
+
+#define BITCOUNT_HELPER_RETURN_TYPE conditional_t<is_vector_v<T>, vector<int32_t, vector_traits<T>::Dimension>, int32_t>
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(bitCount_helper, bitCount, (T), (T), BITCOUNT_HELPER_RETURN_TYPE)
+#undef BITCOUNT_HELPER_RETURN_TYPE
+
+#undef DECLVAL
+#undef DECL_ARG
+#undef WRAP
+#undef ARG
+#undef AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER
+
+template<typename UInt64> NBL_PARTIAL_REQ_TOP(is_same_v<UInt64, uint64_t>)
+struct find_msb_helper<UInt64 NBL_PARTIAL_REQ_BOT(is_same_v<UInt64, uint64_t>) >
+{
+	using return_t = int32_t;
+	static return_t __call(NBL_CONST_REF_ARG(UInt64) val)
+	{
+		const uint32_t highBits = uint32_t(val >> 32);
+		const int32_t highMsb = find_msb_helper<uint32_t>::__call(highBits);
 
-DEFINE_BUILTIN_VECTOR_SPECIALIZATION(float16_t, BUILTIN_VECTOR_SPECIALIZATION_RET_VAL)
-DEFINE_BUILTIN_VECTOR_SPECIALIZATION(float32_t, BUILTIN_VECTOR_SPECIALIZATION_RET_VAL)
-DEFINE_BUILTIN_VECTOR_SPECIALIZATION(float64_t, BUILTIN_VECTOR_SPECIALIZATION_RET_VAL)
+		if (highMsb == -1)
+		{
+			const uint32_t lowBits = uint32_t(val);
+			const int32_t lowMsb = find_msb_helper<uint32_t>::__call(lowBits);
+			if (lowMsb == -1)
+				return -1;
 
-#undef BUILTIN_VECTOR_SPECIALIZATION_RET_VAL
-#undef DEFINE_BUILTIN_VECTOR_SPECIALIZATION
+			return lowMsb;
+		}
 
-template<typename Integer>
-struct find_msb_helper;
+		return highMsb + 32;
+	}
+};
+template<typename UInt64> NBL_PARTIAL_REQ_TOP(is_same_v<UInt64, uint64_t>)
+struct find_lsb_helper<UInt64 NBL_PARTIAL_REQ_BOT(is_same_v<UInt64, uint64_t>) >
+{
+	static int32_t __call(NBL_CONST_REF_ARG(uint64_t) val)
+	{
+		const uint32_t lowBits = uint32_t(val);
+		const int32_t lowLsb = find_lsb_helper<uint32_t>::__call(lowBits);
 
-template<>
-struct find_msb_helper<uint32_t>
+		if (lowLsb == -1)
+		{
+			const uint32_t highBits = uint32_t(val >> 32);
+			const int32_t highLsb = find_lsb_helper<uint32_t>::__call(highBits);
+			if (highLsb == -1)
+				return -1;
+			else
+				return 32 + highLsb;
+		}
+
+		return lowLsb;
+	}
+};
+
+template<typename SquareMatrix>
+NBL_PARTIAL_REQ_TOP(concepts::Matrix<SquareMatrix>&& matrix_traits<SquareMatrix>::Square)
+struct inverse_helper<SquareMatrix NBL_PARTIAL_REQ_BOT(concepts::Matrix<SquareMatrix>&& matrix_traits<SquareMatrix>::Square) >
 {
-	static int32_t findMSB(NBL_CONST_REF_ARG(uint32_t) val)
+	static SquareMatrix __call(NBL_CONST_REF_ARG(SquareMatrix) mat)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findUMsb(val);
-#else
-		return glm::findMSB(val);
-#endif
+		return spirv::matrixInverse(mat);
 	}
 };
 
-template<>
-struct find_msb_helper<int32_t>
+template<typename T, typename U> NBL_PARTIAL_REQ_TOP(always_true<decltype(spirv::fMix<T>(experimental::declval<T>(), experimental::declval<T>(), experimental::declval<U>()))>)
+struct mix_helper<T, U NBL_PARTIAL_REQ_BOT(always_true<decltype(spirv::fMix<T>(experimental::declval<T>(), experimental::declval<T>(), experimental::declval<U>()))>) >
 {
-	static int32_t findMSB(NBL_CONST_REF_ARG(int32_t) val)
+	using return_t = conditional_t<is_vector_v<T>, vector<typename vector_traits<T>::scalar_type, vector_traits<T>::Dimension>, T>;
+	static inline return_t __call(const T x, const T y, const U a)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findSMsb(val);
-#else
-		return glm::findMSB(val);
-#endif
+		T aAsT = a;
+		return spirv::fMix<T>(x, y, aAsT);
+	}
+};
+
+template<typename SquareMatrix> NBL_PARTIAL_REQ_TOP(matrix_traits<SquareMatrix>::Square)
+struct determinant_helper<SquareMatrix NBL_PARTIAL_REQ_BOT(matrix_traits<SquareMatrix>::Square) >
+{
+	static typename matrix_traits<SquareMatrix>::scalar_type __call(NBL_CONST_REF_ARG(SquareMatrix) mat)
+	{
+		return spirv::determinant(mat);
 	}
 };
 
-#define DEFINE_FIND_MSB_COMMON_SPECIALIZATION(INPUT_INTEGER_TYPE, INTEGER_TYPE)\
-template<>\
-struct find_msb_helper<INPUT_INTEGER_TYPE>\
+#else // C++ only specializations
+
+#define DECL_ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) const _T arg##i
+#define WRAP(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) _T
+#define ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) arg##i
+
+// the template<> needs to be written ourselves
+// return type is __VA_ARGS__ to protect against `,` in templated return types
+#define AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(HELPER_NAME, STD_FUNCTION_NAME, REQUIREMENT, ARG_TYPE_LIST, ARG_TYPE_SET, ...)\
+requires REQUIREMENT \
+struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST)>\
 {\
-	static int32_t findMSB(NBL_CONST_REF_ARG(INPUT_INTEGER_TYPE) val)\
+	using return_t = __VA_ARGS__;\
+	static inline return_t __call( BOOST_PP_SEQ_FOR_EACH_I(DECL_ARG, _, ARG_TYPE_SET) )\
 	{\
-		return find_msb_helper<INTEGER_TYPE>::findMSB(val);\
+		return std::STD_FUNCTION_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST)>( BOOST_PP_SEQ_FOR_EACH_I(ARG, _, ARG_TYPE_SET) );\
 	}\
-};\
+};
 
-DEFINE_FIND_MSB_COMMON_SPECIALIZATION(int16_t, int32_t)
-DEFINE_FIND_MSB_COMMON_SPECIALIZATION(uint16_t, uint32_t)
-#ifndef __HLSL_VERSION
-DEFINE_FIND_MSB_COMMON_SPECIALIZATION(int8_t, int32_t)
-DEFINE_FIND_MSB_COMMON_SPECIALIZATION(uint8_t, uint32_t)
-#endif
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(clamp_helper, clamp, concepts::Scalar<T>, (T), (T)(T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(max_helper, max, concepts::Scalar<T>, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(min_helper, min, concepts::Scalar<T>, (T), (T)(T), T)
+
+#undef DECL_ARG
+#undef WRAP
+#undef ARG
+#undef AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER
 
-template<>
-struct find_msb_helper<uint64_t>
+template<typename T>
+requires concepts::IntegralScalar<T>
+struct bitReverse_helper<T>
 {
-	static int32_t findMSB(NBL_CONST_REF_ARG(uint64_t) val)
+	static inline T __call(NBL_CONST_REF_ARG(T) arg)
 	{
-#ifdef __HLSL_VERSION
-		const uint32_t highBits = uint32_t(val >> 32);
-		const int32_t highMsb = find_msb_helper<uint32_t>::findMSB(highBits);
-
-		if (highMsb == -1)
-		{
-			const uint32_t lowBits = uint32_t(val);
-			const int32_t lowMsb = find_msb_helper<uint32_t>::findMSB(lowBits);
-			if (lowMsb == -1)
-				return -1;
-
-			return lowMsb;
-		}
-
-		return highMsb + 32;
-#else
-		return glm::findMSB(val);
-#endif
+		return glm::bitfieldReverse<T>(arg);
 	}
 };
+template<typename Matrix>
+requires concepts::Matrix<Matrix>
+struct transpose_helper<Matrix> // C++ path: transposes through glm by viewing the matrix as its Base type
+{
+	using transposed_t = typename matrix_traits<Matrix>::transposed_type;
 
-template<int N>
-struct find_msb_helper<vector<uint32_t, N> >
+	static transposed_t __call(NBL_CONST_REF_ARG(Matrix) m) // NOTE(review): the outer cast below forms a non-const reference to the value glm::transpose returns - confirm every target compiler accepts that
+	{
+		return reinterpret_cast<transposed_t&>(glm::transpose(reinterpret_cast<typename Matrix::Base const&>(m)));
+	}
+};
+template<typename Vector>
+requires concepts::FloatingPointVector<Vector>
+struct length_helper<Vector> // C++ path: Euclidean length of a floating point vector
 {
-	static vector<int32_t, N> findMSB(NBL_CONST_REF_ARG(vector<uint32_t, N>) val)
+	static inline typename vector_traits<Vector>::scalar_type __call(NBL_CONST_REF_ARG(Vector) vec)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findUMsb(val);
-#else
-		return glm::findMSB(val);
-#endif
+		return std::sqrt(dot_helper<Vector>::__call(vec, vec)); // sqrt(dot(vec, vec))
 	}
 };
-
-template<int N>
-struct find_msb_helper<vector<int32_t, N> >
+template<typename Vectorial>
+requires concepts::FloatingPointLikeVectorial<Vectorial> // NOTE(review): length_helper in this file is constrained on FloatingPointVector - confirm it covers every type admitted here
+struct normalize_helper<Vectorial>
 {
-	static vector<int32_t, N> findMSB(NBL_CONST_REF_ARG(vector<int32_t, N>) val)
+	static inline Vectorial __call(NBL_CONST_REF_ARG(Vectorial) vec)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findSMsb(val);
-#else
-		return glm::findMSB(val);
-#endif
+		return vec / length_helper<Vectorial>::__call(vec); // divides by the length; there is no guard for a zero-length input
 	}
 };
 
-#ifndef __HLSL_VERSION
-
+template<typename T>
+requires concepts::IntegralScalar<T>
+struct find_lsb_helper<T> // C++ scalar path: forwards to glm::findLSB
+{
+	using return_t = int32_t;
+	static inline return_t __call(const T arg) // returns return_t rather than T so a negative result is not converted into the (possibly unsigned or narrower) input type
+	{
+		return glm::findLSB<T>(arg);
+	}
+};
+template<typename Integer>
+NBL_PARTIAL_REQ_TOP(concepts::IntegralScalar<Integer>)
+struct find_msb_helper<Integer NBL_PARTIAL_REQ_BOT(concepts::IntegralScalar<Integer>) >
+{
+	using return_t = int32_t;
+	static return_t __call(NBL_CONST_REF_ARG(Integer) val)
+	{
+		return glm::findMSB<Integer>(val);
+	}
+};
+// TODO: implement to be compatible with both C++ and HLSL when it works with DXC
+template<typename EnumType>
+requires std::is_enum_v<EnumType>
+struct find_lsb_helper<EnumType> // enum overload: delegates to the helper for the enum's underlying integer type
+{
+	using return_t = int32_t;
+	static int32_t __call(NBL_CONST_REF_ARG(EnumType) val)
+	{
+		using underlying_t = std::underlying_type_t<EnumType>;
+		return find_lsb_helper<underlying_t>::__call(static_cast<underlying_t>(val)); // value conversion to the underlying type, then the integer specialization does the work
+	}
+};
 template<typename EnumType>
-	requires std::is_enum_v<EnumType>
+requires std::is_enum_v<EnumType>
 struct find_msb_helper<EnumType>
 {
-	static int32_t findMSB(NBL_CONST_REF_ARG(EnumType) val)
+	using return_t = int32_t;
+	static return_t __call(NBL_CONST_REF_ARG(EnumType) val)
 	{
 		using underlying_t = std::underlying_type_t<EnumType>;
-		return find_msb_helper<underlying_t>::findMSB(static_cast<underlying_t>(val));
+		return find_msb_helper<underlying_t>::__call(static_cast<underlying_t>(val));
 	}
 };
 
-#endif
+template<typename FloatingPoint>
+requires concepts::FloatingPointScalar<FloatingPoint>
+struct rsqrt_helper<FloatingPoint> // C++ scalar path: reciprocal square root
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) x)
+	{
+		// TODO: https://stackoverflow.com/a/62239778
+		return 1.0f / std::sqrt(x); // computed as an ordinary division by std::sqrt(x)
+	}
+};
 
-template<typename Integer>
-struct find_lsb_helper;
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct frac_helper<T> // C++ scalar path: fractional part defined as x - floor(x)
+{
+	using return_t = T;
+	static inline return_t __call(const T x)
+	{
+		return x - std::floor(x); // floor-based, so negative inputs map upwards (e.g. -1.25 gives 0.75)
+	}
+};
 
-template<>
-struct find_lsb_helper<int32_t>
+template<typename Integer>
+requires concepts::IntegralScalar<Integer>
+struct bitCount_helper<Integer> // C++ scalar path: number of set bits via std::popcount
 {
-	static int32_t findLSB(NBL_CONST_REF_ARG(int32_t) val)
+	using return_t = int32_t;
+	static return_t __call(NBL_CONST_REF_ARG(Integer) val)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findILsb(val);
-#else
-		return glm::findLSB(val);
-#endif
+		using UnsignedInteger = typename hlsl::unsigned_integer_of_size_t<sizeof(Integer)>; // same-sized unsigned type, because std::popcount only accepts unsigned integers
+		return std::popcount(static_cast<UnsignedInteger>(val));
 	}
 };
 
-template<>
-struct find_lsb_helper<uint32_t>
+template<typename SquareMatrix>
+requires concepts::Matrix<SquareMatrix> && matrix_traits<SquareMatrix>::Square
+struct inverse_helper<SquareMatrix>
 {
-	static int32_t findLSB(NBL_CONST_REF_ARG(uint32_t) val)
+	static SquareMatrix __call(NBL_CONST_REF_ARG(SquareMatrix) mat)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findILsb(val);
-#else
-		return glm::findLSB(val);
-#endif
+		return reinterpret_cast<SquareMatrix&>(glm::inverse(reinterpret_cast<typename SquareMatrix::Base const&>(mat)));
 	}
 };
 
-#define DEFINE_FIND_LSB_COMMON_SPECIALIZATION(INPUT_INTEGER_TYPE, INTEGER_TYPE)\
-template<>\
-struct find_lsb_helper<INPUT_INTEGER_TYPE>\
-{\
-	static int32_t findLSB(NBL_CONST_REF_ARG(INPUT_INTEGER_TYPE) val)\
-	{\
-		return find_lsb_helper<INTEGER_TYPE>::findLSB(val);\
-	}\
-};\
+template<typename EnumT>
+requires std::is_enum_v<EnumT>
+struct bitCount_helper<EnumT> // enum overload: counts the set bits of the enum's underlying integer value
+{
+	using return_t = int32_t;
+	using underlying_t = std::underlying_type_t<EnumT>;
+	static return_t __call(NBL_CONST_REF_ARG(EnumT) val)
+	{
+		return bitCount_helper<underlying_t>::__call(static_cast<underlying_t>(val)); // plain underlying type + static_cast, matching the enum find_lsb/find_msb helpers
+	}
+};
 
-DEFINE_FIND_LSB_COMMON_SPECIALIZATION(int16_t, int32_t)
-DEFINE_FIND_LSB_COMMON_SPECIALIZATION(uint16_t, uint32_t)
-#ifndef __HLSL_VERSION
-DEFINE_FIND_LSB_COMMON_SPECIALIZATION(int8_t, int32_t)
-DEFINE_FIND_LSB_COMMON_SPECIALIZATION(uint8_t, uint32_t)
-#endif
+template<typename T, typename U>
+requires concepts::FloatingPoint<T> && (concepts::FloatingPoint<U> || concepts::Boolean<U>) // x and y are floating point; the interpolant U may be floating point or boolean
+struct mix_helper<T, U>
+{
+	using return_t = T;
+	static inline return_t __call(const T x, const T y, const U a)
+	{
+		return glm::mix(x, y, a); // forwards all three arguments to glm::mix unchanged
+	}
+};
 
-template<>
-struct find_lsb_helper<uint64_t>
+template<typename T>
+requires concepts::FloatingPointScalar<T> || concepts::IntegralScalar<T>
+struct sign_helper<T>
 {
-	static int32_t findLSB(NBL_CONST_REF_ARG(uint64_t) val)
+	using return_t = T;
+	static inline return_t __call(const T val)
 	{
-#ifdef __HLSL_VERSION
-		const uint32_t lowBits = uint32_t(val);
-		const int32_t lowLsb = find_lsb_helper<uint32_t>::findLSB(lowBits);
+		if (val < 0)
+			return -1;
+		if (val > 0)
+			return 1;
 
-		if (lowLsb == -1)
-		{
-			const uint32_t highBits = uint32_t(val >> 32);
-			const int32_t highLsb = find_lsb_helper<uint32_t>::findLSB(highBits);
-			if (highLsb == -1)
-				return -1;
-			else
-				return 32 + highLsb;
-		}
+		return 0;
+	}
+};
 
-		return lowLsb;
-#else
-		return glm::findLSB(val);
-#endif
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct radians_helper<T>
+{
+	using return_t = T;
+	static inline return_t __call(const T degrees)
+	{
+		return degrees * (numbers::pi<T> / static_cast<T>(180.0));
 	}
 };
 
-template<int N>
-struct find_lsb_helper<vector<int32_t, N> >
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct degrees_helper<T>
 {
-	static vector<int32_t, N> findLSB(NBL_CONST_REF_ARG(vector<int32_t, N>) val)
+	using return_t = T;
+	static inline return_t __call(const T radians)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findILsb(val);
-#else
-		return glm::findLSB(val);
-#endif
+		return radians * (static_cast<T>(180.0) / numbers::pi<T>);
 	}
 };
 
-template<int N>
-struct find_lsb_helper<vector<uint32_t, N> >
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct step_helper<T>
 {
-	static vector<int32_t, N> findLSB(NBL_CONST_REF_ARG(vector<uint32_t, N>) val)
+	using return_t = T;
+	static inline return_t __call(const T edge, const T x)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::findILsb(val);
-#else
-		return glm::findLSB(val);
-#endif
+		return x < edge ? 0.0 : 1.0;
 	}
 };
 
-#ifndef __HLSL_VERSION
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct smoothStep_helper<T> // C++ scalar path: cubic 3t^2 - 2t^3 applied to the clamped, normalised position of x between the edges
+{
+	using return_t = T;
+	static inline return_t __call(const T edge0, const T edge1, const T x)
+	{
+		T t = clamp_helper<T>::__call((x - edge0) / (edge1 - edge0), 0, 1); // no guard for edge0 == edge1, which divides by zero
+		return t * t * (3 - 2 * t);
+	}
+};
 
-template<typename EnumType>
-requires std::is_enum_v<EnumType>
-struct find_lsb_helper<EnumType>
+template<typename SquareMatrix>
+NBL_PARTIAL_REQ_TOP(matrix_traits<SquareMatrix>::Square)
+struct determinant_helper<SquareMatrix NBL_PARTIAL_REQ_BOT(matrix_traits<SquareMatrix>::Square) >
 {
-	static int32_t findLSB(NBL_CONST_REF_ARG(EnumType) val)
+	static typename matrix_traits<SquareMatrix>::scalar_type __call(NBL_CONST_REF_ARG(SquareMatrix) mat)
 	{
-		using underlying_t = std::underlying_type_t<EnumType>;
-		return find_lsb_helper<underlying_t>::findLSB(static_cast<underlying_t>(val));
+		return glm::determinant(reinterpret_cast<typename SquareMatrix::Base const&>(mat));
 	}
 };
 
-#endif
+template<typename T>
+requires concepts::FloatingPointVectorial<T>
+struct faceForward_helper<T> // yields N when dot(Nref, I) is negative, otherwise -N
+{
+	using return_t = T;
+	static inline return_t __call(const T N, const T I, const T Nref)
+	{
+		const auto NrefDotI = dot_helper<T>::__call(Nref, I);
+		if (NrefDotI < 0.0)
+			return N;
+		return -N; // dot(Nref, I) was not negative, so the normal gets flipped
+	}
+};
 
-template<typename Integer>
-struct find_msb_return_type
+template<typename T>
+requires concepts::FloatingPointVector<T>
+struct reflect_helper<T>
 {
-	using type = int32_t;
+	using return_t = T;
+	static inline return_t __call(const T I, const T N)
+	{
+		return I - T(2.0 * dot_helper<T>::__call(N, I)) * N;
+	}
 };
-template<typename Integer, int N>
-struct find_msb_return_type<vector<Integer, N> >
+
+template<typename T, typename U>
+requires concepts::FloatingPointVector<T> && concepts::FloatingPointScalar<U>
+struct refract_helper<T, U> // C++ path: refraction of I about N with ratio eta; returns a zero vector when k is negative
 {
-	using type = vector<int32_t, N>;
+	using return_t = T;
+	static inline return_t __call(const T I, const T N, const U eta)
+	{
+		const auto NdotI = dot_helper<T>::__call(N, I); // evaluated once and reused below instead of three separate dot products
+		U k = 1.0 - eta * eta * (1.0 - NdotI * NdotI);
+		if (k < 0.0)
+			return T(0.0);
+		return eta * I - (eta * NdotI + std::sqrt(k)) * N;
+	}
 };
-template<typename Integer>
-using find_lsb_return_type = find_msb_return_type<Integer>;
 
-template<typename T NBL_STRUCT_CONSTRAINABLE>
-struct bitReverse_helper;
+#endif // C++ only specializations
 
-template<typename Integer>
-NBL_PARTIAL_REQ_TOP(hlsl::is_integral_v<Integer> && hlsl::is_scalar_v<Integer>)
-struct bitReverse_helper<Integer NBL_PARTIAL_REQ_BOT(hlsl::is_integral_v<Integer>&& hlsl::is_scalar_v<Integer>) >
+// C++ and HLSL specializations
+
+template<typename T, uint16_t Bits>
+NBL_PARTIAL_REQ_TOP(concepts::UnsignedIntegralScalar<T> && (Bits <= sizeof(T) * 8))
+struct bitReverseAs_helper<T, Bits NBL_PARTIAL_REQ_BOT(concepts::UnsignedIntegralScalar<T> && (Bits <= sizeof(T) * 8)) > // reverses the whole word, then shifts right by (bit width of T - bit count)
 {
-	static inline Integer __call(NBL_CONST_REF_ARG(Integer) val)
+	static T __call(NBL_CONST_REF_ARG(T) val) // uses the compile-time Bits, which the constraint bounds by the bit width of T
 	{
-#ifdef __HLSL_VERSION
-		return spirv::bitReverse(val);
-#else
-		return glm::bitfieldReverse(val);
-#endif
+		return bitReverse_helper<T>::__call(val) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - Bits));
+	}
+
+	static T __call(NBL_CONST_REF_ARG(T) val, uint16_t bits) // runtime variant: `bits` is not range-checked in this function
+	{
+		return bitReverse_helper<T>::__call(val) >> promote<T, scalar_type_t<T> >(scalar_type_t <T>(sizeof(T) * 8 - bits));
 	}
 };
 
-template<typename Vector>
-NBL_PARTIAL_REQ_TOP(hlsl::is_vector_v<Vector>)
-struct bitReverse_helper<Vector NBL_PARTIAL_REQ_BOT(hlsl::is_integral_v<Vector> && hlsl::is_vector_v<Vector>) >
+template<typename Vectorial>
+NBL_PARTIAL_REQ_TOP(concepts::Vectorial<Vectorial>)
+struct dot_helper<Vectorial NBL_PARTIAL_REQ_BOT(concepts::Vectorial<Vectorial>) > // generic dot product: sum of component-wise products
+{
+	using scalar_type = typename vector_traits<Vectorial>::scalar_type;
+
+	static inline scalar_type __call(NBL_CONST_REF_ARG(Vectorial) lhs, NBL_CONST_REF_ARG(Vectorial) rhs)
+	{
+		static const uint32_t ComponentCount = vector_traits<Vectorial>::Dimension;
+		static array_get<Vectorial, scalar_type> component;
+
+		scalar_type sum = component(lhs, 0) * component(rhs, 0); // seeded with component 0, so the loop starts at 1
+		for (uint32_t c = 1; c < ComponentCount; ++c)
+			sum = sum + component(lhs, c) * component(rhs, c);
+
+		return sum;
+	}
+};
+template<typename FloatingPointLikeVectorial>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial<FloatingPointLikeVectorial> && (vector_traits<FloatingPointLikeVectorial>::Dimension == 3))
+struct cross_helper<FloatingPointLikeVectorial NBL_PARTIAL_REQ_BOT(concepts::FloatingPointLikeVectorial<FloatingPointLikeVectorial> && (vector_traits<FloatingPointLikeVectorial>::Dimension == 3)) >
 {
-	static Vector __call(NBL_CONST_REF_ARG(Vector) vec)
+	static FloatingPointLikeVectorial __call(NBL_CONST_REF_ARG(FloatingPointLikeVectorial) lhs, NBL_CONST_REF_ARG(FloatingPointLikeVectorial) rhs)
 	{
 #ifdef __HLSL_VERSION
-		return spirv::bitReverse(vec);
+		return spirv::cross(lhs, rhs);
 #else
-		Vector output;
-		using traits = hlsl::vector_traits<Vector>;
-		for (uint32_t i = 0; i < traits::Dimension; ++i)
-			output[i] = bitReverse_helper<scalar_type_t<Vector> >::__call(vec[i]);
+		using traits = hlsl::vector_traits<FloatingPointLikeVectorial>;
+		array_get<FloatingPointLikeVectorial, typename traits::scalar_type> getter;
+		array_set<FloatingPointLikeVectorial, typename traits::scalar_type> setter;
+
+		FloatingPointLikeVectorial output;
+		setter(output, 0, getter(lhs, 1) * getter(rhs, 2) - getter(rhs, 1) * getter(lhs, 2));
+		setter(output, 1, getter(lhs, 2) * getter(rhs, 0) - getter(rhs, 2) * getter(lhs, 0));
+		setter(output, 2, getter(lhs, 0) * getter(rhs, 1) - getter(rhs, 0) * getter(lhs, 1));
+
 		return output;
 #endif
 	}
 };
 
-
-template<typename T, typename U NBL_STRUCT_CONSTRAINABLE>
-struct lerp_helper;
-
 #ifdef __HLSL_VERSION
-#define MIX_FUNCTION spirv::fMix
+// SPIR-V already defines specializations for builtin vector types
+#define VECTOR_SPECIALIZATION_CONCEPT concepts::Vectorial<T> && !is_vector_v<T>
 #else
-#define MIX_FUNCTION glm::mix
+#define VECTOR_SPECIALIZATION_CONCEPT concepts::Vectorial<T>
 #endif
 
-#define DEFINE_LERP_HELPER_COMMON_SPECIALIZATION(TYPE)\
-template<>\
-struct lerp_helper<TYPE, TYPE>\
-{\
-	static inline TYPE lerp(NBL_CONST_REF_ARG(TYPE) x, NBL_CONST_REF_ARG(TYPE) y, NBL_CONST_REF_ARG(TYPE) a)\
-	{\
-		return MIX_FUNCTION(x, y, a);\
-	}\
-};\
-\
-template<int N>\
-struct lerp_helper<vector<TYPE, N>, vector<TYPE, N> >\
-{\
-	static inline vector<TYPE, N> lerp(NBL_CONST_REF_ARG(vector<TYPE, N>) x, NBL_CONST_REF_ARG(vector<TYPE, N>) y, NBL_CONST_REF_ARG(vector<TYPE, N>) a)\
-	{\
-		return MIX_FUNCTION(x, y, a);\
-	}\
-};\
-\
-template<int N>\
-struct lerp_helper<vector<TYPE, N>, TYPE>\
-{\
-	static inline vector<TYPE, N> lerp(NBL_CONST_REF_ARG(vector<TYPE, N>) x, NBL_CONST_REF_ARG(vector<TYPE, N>) y, NBL_CONST_REF_ARG(TYPE) a)\
-	{\
-		return MIX_FUNCTION(x, y, a);\
-	}\
-};\
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct clamp_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) val, NBL_CONST_REF_ARG(T) min, NBL_CONST_REF_ARG(T) max)
+	{
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<return_t, typename traits::scalar_type> setter;
 
-DEFINE_LERP_HELPER_COMMON_SPECIALIZATION(float32_t)
-DEFINE_LERP_HELPER_COMMON_SPECIALIZATION(float64_t)
+		return_t output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, clamp_helper<typename traits::scalar_type>::__call(getter(val, i), getter(min, i), getter(max, i)));
 
-#undef DEFINE_LERP_HELPER_COMMON_SPECIALIZATION
-#undef MIX_FUNCTION
+		return output;
+	}
+};
 
 template<typename T>
-struct lerp_helper<T, bool>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct min_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
 {
-	static inline T lerp(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(bool) a)
+	static T __call(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
 	{
-		if (a)
-			return y;
-		else
-			return x;
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<T, typename traits::scalar_type> setter;
+
+		T output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, min_helper<typename traits::scalar_type>::__call(getter(a, i), getter(b, i)));
+
+		return output;
+	}
+};
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct max_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	static T __call(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
+	{
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<T, typename traits::scalar_type> setter;
+
+		T output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, max_helper<typename traits::scalar_type>::__call(getter(a, i), getter(b, i)));
+
+		return output;
 	}
 };
 
-template<typename T, int N>
-struct lerp_helper<vector<T, N>, vector<bool, N> >
+template<typename LhsT, typename RhsT>
+NBL_PARTIAL_REQ_TOP(concepts::Matrix<LhsT> && concepts::Vector<RhsT> && (matrix_traits<LhsT>::ColumnCount == vector_traits<RhsT>::Dimension))
+struct mul_helper<LhsT, RhsT NBL_PARTIAL_REQ_BOT(concepts::Matrix<LhsT> && concepts::Vector<RhsT> && (matrix_traits<LhsT>::ColumnCount == vector_traits<RhsT>::Dimension)) >
 {
-	using output_vec_t = vector<T, N>;
+	using lhs_traits = matrix_traits<LhsT>;
+	using rhs_traits = vector_traits<RhsT>;
+	using return_t = vector<typename lhs_traits::scalar_type, lhs_traits::RowCount>;
+	static inline return_t __call(LhsT lhs, RhsT rhs)
+	{
+		return mul(lhs, rhs);
+	}
+};
 
-	static inline output_vec_t lerp(NBL_CONST_REF_ARG(output_vec_t) x, NBL_CONST_REF_ARG(output_vec_t) y, NBL_CONST_REF_ARG(vector<bool, N>) a)
+template<typename LhsT, typename RhsT>
+NBL_PARTIAL_REQ_TOP(concepts::Matrix<LhsT> && concepts::Matrix<RhsT> && (matrix_traits<LhsT>::ColumnCount == matrix_traits<RhsT>::RowCount))
+struct mul_helper<LhsT, RhsT NBL_PARTIAL_REQ_BOT(concepts::Matrix<LhsT> && concepts::Matrix<RhsT> && (matrix_traits<LhsT>::ColumnCount == matrix_traits<RhsT>::RowCount)) >
+{
+	using lhs_traits = matrix_traits<LhsT>;
+	using rhs_traits = matrix_traits<RhsT>;
+	using return_t = matrix<typename lhs_traits::scalar_type, lhs_traits::RowCount, rhs_traits::ColumnCount>;
+	static inline return_t __call(LhsT lhs, RhsT rhs)
 	{
-		output_vec_t retval;
-		for (uint32_t i = 0; i < vector_traits<output_vec_t>::Dimension; i++)
-			retval[i] = a[i] ? y[i] : x[i];
-		return retval;
+		return mul(lhs, rhs);
 	}
 };
 
-template<typename Matrix>
-struct transpose_helper;
+#define AUTO_SPECIALIZE_HELPER_FOR_VECTOR(HELPER_NAME, REQUIREMENT, RETURN_TYPE)\
+template<typename T>\
+NBL_PARTIAL_REQ_TOP(REQUIREMENT)\
+struct HELPER_NAME<T NBL_PARTIAL_REQ_BOT(REQUIREMENT) >\
+{\
+	using return_t = RETURN_TYPE;\
+	static return_t __call(NBL_CONST_REF_ARG(T) vec)\
+	{\
+		using traits = hlsl::vector_traits<T>;\
+		using return_t_traits = hlsl::vector_traits<return_t>;\
+		array_get<T, typename traits::scalar_type> getter;\
+		array_set<return_t, typename return_t_traits::scalar_type> setter;\
+\
+		return_t output;\
+		for (uint32_t i = 0; i < traits::Dimension; ++i)\
+			setter(output, i, HELPER_NAME<typename traits::scalar_type>::__call(getter(vec, i)));\
+\
+		return output;\
+	}\
+};
 
-template<typename T, int N, int M>
-struct transpose_helper<matrix<T, N, M> >
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(rsqrt_helper, concepts::FloatingPointVectorial<T> && VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(bitReverse_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(frac_helper, VECTOR_SPECIALIZATION_CONCEPT,T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sign_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(degrees_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(radians_helper, VECTOR_SPECIALIZATION_CONCEPT, T)
+#define INT32_VECTOR_TYPE vector<int32_t, hlsl::vector_traits<T>::Dimension>
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(bitCount_helper, VECTOR_SPECIALIZATION_CONCEPT, INT32_VECTOR_TYPE)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(find_msb_helper, VECTOR_SPECIALIZATION_CONCEPT, INT32_VECTOR_TYPE)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(find_lsb_helper, VECTOR_SPECIALIZATION_CONCEPT, INT32_VECTOR_TYPE)
+#undef INT32_VECTOR_TYPE
+#undef AUTO_SPECIALIZE_HELPER_FOR_VECTOR
+
+template<typename BooleanVector>
+NBL_PARTIAL_REQ_TOP(concepts::Vectorial<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>)
+struct all_helper<BooleanVector NBL_PARTIAL_REQ_BOT(concepts::Vectorial<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>) >
 {
-	using transposed_t = typename matrix_traits<matrix<T, N, M> >::transposed_type;
+	static bool __call(NBL_CONST_REF_ARG(BooleanVector) x)
+	{
+		using traits = hlsl::vector_traits<BooleanVector>;
+		array_get<BooleanVector, typename traits::scalar_type> getter;
+		// logical AND over every component of x; nothing is written back, so only a getter is needed
+
+		bool output = true;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			output = output && getter(x, i);
+
+		return output;
+	}
+};
 
-	static transposed_t transpose(NBL_CONST_REF_ARG(matrix<T, N, M>) m)
+template<typename BooleanVector>
+NBL_PARTIAL_REQ_TOP(concepts::Vectorial<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>)
+struct any_helper<BooleanVector NBL_PARTIAL_REQ_BOT(concepts::Vectorial<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>) >
+{
+	static bool __call(NBL_CONST_REF_ARG(BooleanVector) x)
 	{
-#ifdef __HLSL_VERSION
-		return spirv::transpose(m);
-#else
-		return reinterpret_cast<transposed_t&>(glm::transpose(reinterpret_cast<typename matrix<T, N, M>::Base const&>(m)));
-#endif
+		using traits = hlsl::vector_traits<BooleanVector>;
+		array_get<BooleanVector, typename traits::scalar_type> getter;
+		// logical OR over every component of x; nothing is written back, so only a getter is needed
+
+		bool output = false;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			output = output || getter(x, i);
+
+		return output;
 	}
 };
 
-template<typename LhsT, typename RhsT>
-struct mul_helper
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct step_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
 {
-	static inline RhsT multiply(LhsT lhs, RhsT rhs)
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) edge, NBL_CONST_REF_ARG(T) x)
 	{
-		return mul(lhs, rhs);
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<return_t, typename traits::scalar_type> setter;
+		
+		return_t output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, step_helper<typename traits::scalar_type>::__call(getter(edge, i), getter(x, i)));
+		
+		return output;
+	}
+};
+
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct smoothStep_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) edge0, NBL_CONST_REF_ARG(T) edge1, NBL_CONST_REF_ARG(T) x)
+	{
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<return_t, typename traits::scalar_type> setter;
+
+		return_t output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, smoothStep_helper<typename traits::scalar_type>::__call(getter(edge0, i), getter(edge1, i), getter(x, i)));
+
+		return output;
 	}
 };
 
diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
new file mode 100644
index 0000000000..e7d98d42f2
--- /dev/null
+++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
@@ -0,0 +1,151 @@
+#ifndef _NBL_BUILTIN_HLSL_CPP_COMPAT_INTRINSICS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CPP_COMPAT_INTRINSICS_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+
+// this is a C++ only header, hence the `.h` extension, it only implements HLSL's built-in functions
+#ifndef __HLSL_VERSION
+#include <algorithm>
+#include <cmath>
+#include "nbl/core/util/bitflag.h"
+
+namespace nbl::hlsl
+{
+// TODO: remove this macro and write stuff by hand, the aliasing stuff doesn't work
+#define NBL_SIMPLE_GLM_PASSTHROUGH(HLSL_ID,GLSL_ID,...) template<typename... Args>\
+inline auto HLSL_ID(Args&&... args) \
+{ \
+    return glm::GLSL_ID(std::forward<Args>(args)...);\
+}
+#define NBL_BIT_OP_GLM_PASSTHROUGH(HLSL_ID,GLSL_ID) template<typename T> \
+inline auto HLSL_ID(const T bitpattern) \
+{ \
+    if constexpr (std::is_integral_v<T>) \
+        return glm::GLSL_ID(bitpattern); \
+    else \
+    { \
+        if constexpr (std::is_enum_v<T>) \
+        { \
+            const auto as_underlying = static_cast<std::underlying_type_t<T>>(bitpattern); \
+            return glm::GLSL_ID(as_underlying); \
+        } \
+        else \
+        { \
+            if constexpr (std::is_same_v<T,core::bitflag<typename T::enum_t>>) \
+                return HLSL_ID<typename T::enum_t>(bitpattern.value); \
+        } \
+    } \
+}
+
+NBL_BIT_OP_GLM_PASSTHROUGH(bitCount,bitCount)
+
+NBL_SIMPLE_GLM_PASSTHROUGH(cross,cross)
+NBL_SIMPLE_GLM_PASSTHROUGH(clamp,clamp)
+NBL_SIMPLE_GLM_PASSTHROUGH(normalize, normalize)
+
+template<typename T>
+inline scalar_type_t<T> length(const T& vec)
+{
+    return glm::length(vec);
+}
+
+template<typename T>
+inline scalar_type_t<T> dot(const T& lhs, const T& rhs) // sum of component-wise products, seeded with component 0
+{
+    scalar_type_t<T> retval = lhs[0]*rhs[0];
+    // whatever has a `scalar_type` specialization should be a pure vector, so sizeof(T)/sizeof(component) is used as the component count
+    for (int i=1, count=static_cast<int>(sizeof(T)/sizeof(retval)); i<count; i++)
+        retval += lhs[i]*rhs[i];
+    return retval;
+}
+
+// determinant not defined cause its implemented via hidden friend
+// https://stackoverflow.com/questions/67459950/why-is-a-friend-function-not-treated-as-a-member-of-a-namespace-of-a-class-it-wa
+template<typename T, uint16_t N, uint16_t M>
+inline T determinant(const matrix<T,N,M>& m)
+{
+    return glm::determinant(reinterpret_cast<typename matrix<T,N,M>::Base const&>(m));
+}
+
+NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB)
+
+NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB)
+
+// TODO: some of the functions in this header should move to `tgmath`
+template<typename T> requires ::nbl::hlsl::is_floating_point_v<T>
+inline T floor(const T& v)
+{
+    return glm::floor(v);
+}
+
+
+// inverse not defined cause its implemented via hidden friend
+template<typename T, uint16_t N, uint16_t M>
+inline matrix<T,N,M> inverse(const matrix<T,N,M>& m) // delegates to glm::inverse on the matrix viewed as its Base type
+{
+    static_assert(!(N == 3 && M == 4)); // only the 3x4 shape is rejected here; NOTE(review): other non-square shapes are not asserted against - confirm intent
+    return reinterpret_cast<matrix<T,N,M>&>(glm::inverse(reinterpret_cast<typename matrix<T,N,M>::Base const&>(m)));
+}
+
+template<typename T, typename U>
+inline T lerp(const T& x, const T& y, const U& a) // bool / bool-vector `a` selects between x and y; any other `a` goes to glm::mix
+{
+    if constexpr (std::is_same_v<U,bool>)
+        return a ? y:x;
+    else
+    {
+        if constexpr (std::is_same_v<scalar_type_t<U>,bool>)
+        {
+            T retval;
+            // whatever has a `scalar_type` specialization should be a pure vector, so sizeof(a)/sizeof(component) is used as the component count
+            for (int i=0, count=static_cast<int>(sizeof(a)/sizeof(scalar_type_t<U>)); i<count; i++)
+                retval[i] = a[i] ? y[i]:x[i];
+            return retval;
+        }
+        else
+            return glm::mix<T,U>(x,y,a);
+    }
+}
+
+// transpose not defined cause its implemented via hidden friend
+template<typename T, uint16_t N, uint16_t M>
+inline matrix<T,M,N> transpose(const matrix<T,N,M>& m)
+{
+    return reinterpret_cast<matrix<T,M,N>&>(glm::transpose(reinterpret_cast<typename matrix<T,N,M>::Base const&>(m)));
+}
+
+#undef NBL_BIT_OP_GLM_PASSTHROUGH
+#undef NBL_SIMPLE_GLM_PASSTHROUGH
+
+// TODO: remove this macro and write stuff by hand, the aliasing stuff doesn't work
+#define NBL_ALIAS_TEMPLATE_FUNCTION(origFunctionName, functionAlias) \
+template<typename... Args> \
+inline auto functionAlias(Args&&... args) -> decltype(origFunctionName(std::forward<Args>(args)...)) \
+{ \
+    return origFunctionName(std::forward<Args>(args)...); \
+}
+
+NBL_ALIAS_TEMPLATE_FUNCTION(std::min, min);
+
+template<typename T>
+inline T max(const T& a, const T& b) // implemented as a select: the result of `b>a` is handed to lerp as the selector
+{
+    return lerp<T>(a,b,b>a); // picks b where b>a holds, a otherwise
+}
+
+NBL_ALIAS_TEMPLATE_FUNCTION(std::isnan, isnan);
+NBL_ALIAS_TEMPLATE_FUNCTION(std::isinf, isinf);
+NBL_ALIAS_TEMPLATE_FUNCTION(std::exp2, exp2);
+
+template<typename T>
+inline T rsqrt(T x) // reciprocal square root, computed as an ordinary division by std::sqrt(x)
+{
+    return 1.0f / std::sqrt(x);
+}
+
+
+}
+#endif
+
+#endif
diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl
index 013424d765..77e55221c7 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl
@@ -8,6 +8,9 @@
 #include <nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl>
 #include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
 #include <nbl/builtin/hlsl/ieee754.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/vector.hlsl>
+#include <nbl/builtin/hlsl/concepts/matrix.hlsl>
 
 #ifndef __HLSL_VERSION
 #include <algorithm>
@@ -20,221 +23,293 @@ namespace nbl
 namespace hlsl
 {
 
-template<typename Integer>
-inline int bitCount(NBL_CONST_REF_ARG(Integer) val)
+template<typename T>
+inline typename cpp_compat_intrinsics_impl::bitCount_helper<T>::return_t bitCount(NBL_CONST_REF_ARG(T) val)
 {
-#ifdef __HLSL_VERSION
-	if (sizeof(Integer) == 8u)
-	{
-		uint32_t lowBits = uint32_t(val);
-		uint32_t highBits = uint32_t(uint64_t(val) >> 32u);
+	return cpp_compat_intrinsics_impl::bitCount_helper<T>::__call(val);
+}
+
+template<typename T>
+T cross(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) // thin dispatcher: the per-type work is done by cross_helper<T>
+{
+	return cpp_compat_intrinsics_impl::cross_helper<T>::__call(lhs, rhs);
+}
 
-		return countbits(lowBits) + countbits(highBits);
-	}
+template<typename T> // the return type is whatever clamp_helper<T> declares as `return_t`
+typename cpp_compat_intrinsics_impl::clamp_helper<T>::return_t clamp(NBL_CONST_REF_ARG(T) val, NBL_CONST_REF_ARG(T) _min, NBL_CONST_REF_ARG(T) _max)
+{
+	return cpp_compat_intrinsics_impl::clamp_helper<T>::__call(val, _min, _max); // thin dispatcher: forwards all three arguments unchanged
+}
 
-	return countbits(val);
+template<typename FloatingPointVectorial> // the result is the scalar type reported by vector_traits for the argument type
+typename vector_traits<FloatingPointVectorial>::scalar_type length(NBL_CONST_REF_ARG(FloatingPointVectorial) vec)
+{
+	return cpp_compat_intrinsics_impl::length_helper<FloatingPointVectorial>::__call(vec); // thin dispatcher to length_helper
+}
 
-#else
-	return glm::bitCount(val);
-#endif
+template<typename FloatingPointVectorial>
+FloatingPointVectorial normalize(NBL_CONST_REF_ARG(FloatingPointVectorial) vec) // returns a value of the same type as `vec`
+{
+	return cpp_compat_intrinsics_impl::normalize_helper<FloatingPointVectorial>::__call(vec); // thin dispatcher to normalize_helper
+}
+
+template<typename Vectorial> // the result is the scalar type reported by vector_traits for the argument type
+typename vector_traits<Vectorial>::scalar_type dot(NBL_CONST_REF_ARG(Vectorial) lhs, NBL_CONST_REF_ARG(Vectorial) rhs)
+{
+	return cpp_compat_intrinsics_impl::dot_helper<Vectorial>::__call(lhs, rhs); // thin dispatcher to dot_helper
+}
+
+// determinant not defined because it's implemented via a hidden friend
+// https://stackoverflow.com/questions/67459950/why-is-a-friend-function-not-treated-as-a-member-of-a-namespace-of-a-class-it-wa
+template<typename Matrix NBL_FUNC_REQUIRES(concepts::Matricial<Matrix>) // only participates for types satisfying concepts::Matricial
+inline typename matrix_traits<Matrix>::scalar_type determinant(NBL_CONST_REF_ARG(Matrix) mat)
+{
+	return cpp_compat_intrinsics_impl::determinant_helper<Matrix>::__call(mat); // thin dispatcher to determinant_helper
 }
 
 template<typename T>
-vector<T, 3> cross(NBL_CONST_REF_ARG(vector<T, 3>) lhs, NBL_CONST_REF_ARG(vector<T, 3>) rhs)
+inline typename cpp_compat_intrinsics_impl::find_lsb_helper<T>::return_t findLSB(NBL_CONST_REF_ARG(T) val)
 {
-#ifdef __HLSL_VERSION
-	return spirv::cross(lhs, rhs);
-#else
-	return glm::cross(lhs, rhs);
-#endif
+	return cpp_compat_intrinsics_impl::find_lsb_helper<T>::__call(val);
 }
 
 template<typename T>
-T clamp(NBL_CONST_REF_ARG(T) val, NBL_CONST_REF_ARG(T) min, NBL_CONST_REF_ARG(T) max)
+inline typename cpp_compat_intrinsics_impl::find_msb_helper<T>::return_t findMSB(NBL_CONST_REF_ARG(T) val)
 {
-#ifdef __HLSL_VERSION
-	return clamp(val, min, max);
-#else
-	return glm::clamp(val, min, max);
-#endif
+	return cpp_compat_intrinsics_impl::find_msb_helper<T>::__call(val);
+}
+
+// inverse not defined because it's implemented via a hidden friend
+template<typename Matrix NBL_FUNC_REQUIRES(concepts::Matricial<Matrix>) // only participates for types satisfying concepts::Matricial
+inline Matrix inverse(NBL_CONST_REF_ARG(Matrix) mat)
+{
+	return cpp_compat_intrinsics_impl::inverse_helper<Matrix>::__call(mat); // thin dispatcher to inverse_helper
+}
+
+// transpose not defined because it's implemented via a hidden friend
+template<typename Matrix NBL_FUNC_REQUIRES(concepts::Matricial<Matrix>) // only participates for types satisfying concepts::Matricial
+inline typename matrix_traits<Matrix>::transposed_type transpose(NBL_CONST_REF_ARG(Matrix) m)
+{
+	return cpp_compat_intrinsics_impl::transpose_helper<Matrix>::__call(m); // thin dispatcher; the result type comes from matrix_traits<Matrix>::transposed_type
+}
+
+template<typename LhsT, typename RhsT> // the result type is whatever the matching mul_helper specialization declares as `return_t`
+inline typename cpp_compat_intrinsics_impl::mul_helper<LhsT, RhsT>::return_t mul(LhsT lhs, RhsT rhs)
+{
+	return cpp_compat_intrinsics_impl::mul_helper<LhsT, RhsT>::__call(lhs, rhs); // thin dispatcher: operands are passed by value in the given order
 }
 
 template<typename T>
-typename vector_traits<T>::scalar_type dot(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs)
+inline T min(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
 {
-	return cpp_compat_intrinsics_impl::dot_helper<T>::dot_product(lhs, rhs);
+	return cpp_compat_intrinsics_impl::min_helper<T>::__call(a, b);
 }
 
-// TODO: for clearer error messages, use concepts to ensure that input type is a square matrix
-// determinant not defined cause its implemented via hidden friend
-// https://stackoverflow.com/questions/67459950/why-is-a-friend-function-not-treated-as-a-member-of-a-namespace-of-a-class-it-wa
-template<typename T, uint16_t N>
-inline T determinant(NBL_CONST_REF_ARG(matrix<T, N, N>) m)
+template<typename T>
+inline T max(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
 {
-#ifdef __HLSL_VERSION
-	spirv::determinant(m);
-#else
-	return glm::determinant(reinterpret_cast<typename matrix<T, N, N>::Base const&>(m));
-#endif
+	return cpp_compat_intrinsics_impl::max_helper<T>::__call(a, b);
 }
 
-template<typename Integer>
-inline typename cpp_compat_intrinsics_impl::find_lsb_return_type<Integer>::type findLSB(NBL_CONST_REF_ARG(Integer) val)
+template<typename FloatingPoint>
+inline FloatingPoint rsqrt(FloatingPoint x)
 {
-	return cpp_compat_intrinsics_impl::find_lsb_helper<Integer>::findLSB(val);
+	return cpp_compat_intrinsics_impl::rsqrt_helper<FloatingPoint>::__call(x);
 }
 
 template<typename Integer>
-inline typename cpp_compat_intrinsics_impl::find_msb_return_type<Integer>::type findMSB(NBL_CONST_REF_ARG(Integer) val)
+inline Integer bitReverse(Integer val)
 {
-	return cpp_compat_intrinsics_impl::find_msb_helper<Integer>::findMSB(val);
+	return cpp_compat_intrinsics_impl::bitReverse_helper<Integer>::__call(val);
 }
 
-// TODO: some of the functions in this header should move to `tgmath`
-template<typename T>
-inline T floor(NBL_CONST_REF_ARG(T) val)
+template<typename T, uint16_t Bits>
+/**
+* @brief Takes the binary representation of `val` as a string of `Bits` bits and returns a value of the same type resulting from reversing the string.
+*
+* @tparam T Type of the value to operate on.
+* @tparam Bits The length of the string of bits used to represent `val`.
+*
+* @param [in] val The value to bitreverse.
+*/
+inline T bitReverseAs(T val)
 {
-#ifdef __HLSL_VERSION
-	return spirv::floor(val);
-#else
-	return glm::floor(val);
-#endif
-	
+	return cpp_compat_intrinsics_impl::bitReverseAs_helper<T, Bits>::__call(val);
 }
 
-// TODO: for clearer error messages, use concepts to ensure that input type is a square matrix
-// inverse not defined cause its implemented via hidden friend
-template<typename T, uint16_t N>
-inline matrix<T, N, N> inverse(NBL_CONST_REF_ARG(matrix<T, N, N>) m)
+template<typename T NBL_FUNC_REQUIRES(is_unsigned_v<T>)
+/**
+* @brief Takes the binary representation of `val` and returns a value of the same type resulting from reversing the string of bits as if it was `bits` long.
+* Keep in mind `bits` cannot exceed `8 * sizeof(T)`.
+*
+* @tparam T Type of the value to operate on.
+*
+* @param [in] val The value to bitreverse.
+* @param [in] bits The length of the string of bits used to represent `val`.
+*/
+T bitReverseAs(T val, uint16_t bits)
 {
-#ifdef __HLSL_VERSION
-	return spirv::matrixInverse(m);
-#else
-	return reinterpret_cast<matrix<T, N, N>&>(glm::inverse(reinterpret_cast<typename matrix<T, N, N>::Base const&>(m)));
-#endif
+	return cpp_compat_intrinsics_impl::bitReverseAs_helper<T, 0>::__call(val, bits);
+}
+
+template<typename Vector>
+inline bool all(Vector vec) // reduces `vec` to a single bool through all_helper<Vector>
+{
+	return cpp_compat_intrinsics_impl::all_helper<Vector>::__call(vec);
+}
+
+template<typename Vector>
+inline bool any(Vector vec) // reduces `vec` to a single bool through any_helper<Vector>
+{
+	return cpp_compat_intrinsics_impl::any_helper<Vector>::__call(vec);
+}
+
+/**
+* @brief Returns x - floor(x).
+*
+* @tparam T type of the value to operate on.
+*
+* @param [in] val The value to operate on.
+*/
+template<typename T>
+inline T frac(NBL_CONST_REF_ARG(T) val)
+{
+	return cpp_compat_intrinsics_impl::frac_helper<T>::__call(val);
 }
 
 template<typename T, typename U>
-inline T lerp(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(U) a)
+inline T mix(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(U) a)
 {
-	return cpp_compat_intrinsics_impl::lerp_helper<T, U>::lerp(x, y, a);
+	return cpp_compat_intrinsics_impl::mix_helper<T, U>::__call(x, y, a);
 }
 
-// transpose not defined cause its implemented via hidden friend
-template<typename Matrix>
-inline typename matrix_traits<Matrix>::transposed_type transpose(NBL_CONST_REF_ARG(Matrix) m)
+template<typename T>
+inline T sign(NBL_CONST_REF_ARG(T) val)
 {
-	return cpp_compat_intrinsics_impl::transpose_helper<Matrix>::transpose(m);
+	return cpp_compat_intrinsics_impl::sign_helper<T>::__call(val);
 }
 
-// TODO: concepts, to ensure that MatT is a matrix and VecT is a vector type
-template<typename MatT, typename VecT>
-VecT mul(MatT mat, VecT vec)
+template<typename T>
+inline T radians(NBL_CONST_REF_ARG(T) degrees)
 {
-	return cpp_compat_intrinsics_impl::mul_helper<MatT, VecT>::multiply(mat, vec);
+	return cpp_compat_intrinsics_impl::radians_helper<T>::__call(degrees);
 }
 
 template<typename T>
-inline T min(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
+inline T degrees(NBL_CONST_REF_ARG(T) radians)
 {
-#ifdef __HLSL_VERSION
-	min(a, b);
-#else
-	return glm::min(a, b);
-#endif
+	return cpp_compat_intrinsics_impl::degrees_helper<T>::__call(radians);
 }
 
 template<typename T>
-inline T max(NBL_CONST_REF_ARG(T) a, NBL_CONST_REF_ARG(T) b)
+inline T step(NBL_CONST_REF_ARG(T) edge, NBL_CONST_REF_ARG(T) x)
 {
-#ifdef __HLSL_VERSION
-	max(a, b);
-#else
-	return glm::max(a, b);
-#endif
+	return cpp_compat_intrinsics_impl::step_helper<T>::__call(edge, x);
 }
 
-template<typename FloatingPoint NBL_FUNC_REQUIRES(hlsl::is_floating_point_v<FloatingPoint>)
-inline bool isnan(NBL_CONST_REF_ARG(FloatingPoint) val)
+template<typename T>
+inline T smoothStep(NBL_CONST_REF_ARG(T) edge0, NBL_CONST_REF_ARG(T) edge1, NBL_CONST_REF_ARG(T) x)
 {
-#ifdef __HLSL_VERSION
-	return spirv::isNan(val);
-#else
-	return std::isnan(val);
-#endif
+	return cpp_compat_intrinsics_impl::smoothStep_helper<T>::__call(edge0, edge1, x);
 }
 
-template <typename Integer NBL_FUNC_REQUIRES(hlsl::is_integral_v<Integer>)
-inline bool isnan(Integer val)
+template<typename T>
+inline T faceForward(NBL_CONST_REF_ARG(T) N, NBL_CONST_REF_ARG(T) I, NBL_CONST_REF_ARG(T) Nref)
 {
-	using AsUint = typename unsigned_integer_of_size<sizeof(Integer)>::type;
-	using AsFloat = typename float_of_size<sizeof(Integer)>::type;
+	return cpp_compat_intrinsics_impl::faceForward_helper<T>::__call(N, I, Nref);
+}
 
-	AsUint asUint = bit_cast<AsUint, Integer>(val);
-	return bool((ieee754::extractBiasedExponent<Integer>(val) == ieee754::traits<AsFloat>::specialValueExp) && (asUint & ieee754::traits<AsFloat>::mantissaMask));
+template<typename T>
+inline T reflect(NBL_CONST_REF_ARG(T) I, NBL_CONST_REF_ARG(T) N)
+{
+	return cpp_compat_intrinsics_impl::reflect_helper<T>::__call(I, N);
 }
 
-template<typename FloatingPoint NBL_FUNC_REQUIRES(hlsl::is_floating_point_v<FloatingPoint>)
-inline FloatingPoint isinf(NBL_CONST_REF_ARG(FloatingPoint) val)
+template<typename T, typename U>
+inline T refract(NBL_CONST_REF_ARG(T) I, NBL_CONST_REF_ARG(T) N, NBL_CONST_REF_ARG(U) eta)
 {
+	return cpp_compat_intrinsics_impl::refract_helper<T, U>::__call(I, N, eta);
+}
+
 #ifdef __HLSL_VERSION
-	return spirv::isInf(val);
+#define NAMESPACE spirv
 #else
-	return std::isinf(val);
+#define NAMESPACE glm
 #endif
+
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float32_t4>)
+inline int32_t packSnorm4x8(T vec)
+{
+	return NAMESPACE::packSnorm4x8(vec);
 }
 
-template<typename Integer NBL_FUNC_REQUIRES(hlsl::is_integral_v<Integer>)
-inline bool isinf(Integer val)
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float32_t4>)
+inline int32_t packUnorm4x8(T vec)
 {
-	using AsUint = typename unsigned_integer_of_size<sizeof(Integer)>::type;
-	using AsFloat = typename float_of_size<sizeof(Integer)>::type;
+	return NAMESPACE::packUnorm4x8(vec);
+}
 
-	AsUint tmp = bit_cast<AsUint>(val);
-	return (tmp & (~ieee754::traits<AsFloat>::signMask)) == ieee754::traits<AsFloat>::inf;
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float32_t2>)
+inline int32_t packSnorm2x16(T vec)
+{
+	return NAMESPACE::packSnorm2x16(vec);
 }
 
-template<typename  T>
-inline T exp2(NBL_CONST_REF_ARG(T) val)
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float32_t2>)
+inline int32_t packUnorm2x16(T vec)
 {
-#ifdef __HLSL_VERSION
-	return spirv::exp2(val);
-#else
-	return std::exp2(val);
-#endif
+	return NAMESPACE::packUnorm2x16(vec);
 }
 
-#define DEFINE_EXP2_SPECIALIZATION(TYPE)\
-template<>\
-inline TYPE exp2(NBL_CONST_REF_ARG(TYPE) val)\
-{\
-	return _static_cast<TYPE>(1ull << val);\
-}\
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float32_t2>)
+inline int32_t packHalf2x16(T vec)
+{
+	return NAMESPACE::packHalf2x16(vec);
+}
 
-DEFINE_EXP2_SPECIALIZATION(int16_t)
-DEFINE_EXP2_SPECIALIZATION(int32_t)
-DEFINE_EXP2_SPECIALIZATION(int64_t)
-DEFINE_EXP2_SPECIALIZATION(uint16_t)
-DEFINE_EXP2_SPECIALIZATION(uint32_t)
-DEFINE_EXP2_SPECIALIZATION(uint64_t)
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t2>)
+inline float64_t packDouble2x32(T vec)
+{
+	return NAMESPACE::packDouble2x32(vec);
+}
 
-template<typename FloatingPoint>
-inline FloatingPoint rsqrt(FloatingPoint x)
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t>)
+inline float32_t2 unpackSnorm2x16(T val)
 {
-	// TODO: https://stackoverflow.com/a/62239778
-#ifdef __HLSL_VERSION
-	return spirv::inverseSqrt(x);
-#else
-	return 1.0f / std::sqrt(x);
-#endif
+	return NAMESPACE::unpackSnorm2x16(val);
 }
 
-template<typename Integer>
-inline Integer bitReverse(Integer val)
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t>)
+inline float32_t2 unpackUnorm2x16(T val)
 {
-	return cpp_compat_intrinsics_impl::bitReverse_helper<Integer>::__call(val);
+	return NAMESPACE::unpackUnorm2x16(val);
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t>)
+inline float32_t2 unpackHalf2x16(T val)
+{
+	return NAMESPACE::unpackHalf2x16(val);
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t>)
+inline float32_t4 unpackSnorm4x8(T val)
+{
+	return NAMESPACE::unpackSnorm4x8(val);
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, int32_t>)
+inline float32_t4 unpackUnorm4x8(T val)
+{
+	return NAMESPACE::unpackUnorm4x8(val);
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_same_v<T, float64_t>)
+inline int32_t2 unpackDouble2x32(T val)
+{
+	return NAMESPACE::unpackDouble2x32(val);
 }
 
+#undef NAMESPACE
 
 }
 }
diff --git a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
index b1d33f097b..b04bd6c7e0 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/matrix.hlsl
@@ -14,6 +14,7 @@ struct matrix final : private glm::mat<N,M,T>
     using Base = glm::mat<N,M,T>;
     using Base::Base;
     using Base::operator[];
+    //using type = matrix<T, N, M>;
 
     // For assigning to same dimension use implicit ctor, and even then only allow for dimension truncation
     template<uint16_t X, uint16_t Y> requires ((X!=N || Y!=M) && X>=N && Y>=M)
@@ -27,7 +28,7 @@ struct matrix final : private glm::mat<N,M,T>
         Base::operator=(rhs);
         return *this;
     }
-
+    
     friend matrix operator+(matrix const& lhs, matrix const& rhs){ return matrix(reinterpret_cast<Base const&>(lhs) + reinterpret_cast<Base const&>(rhs)); }
     friend matrix operator-(matrix const& lhs, matrix const& rhs){ return matrix(reinterpret_cast<Base const&>(lhs) - reinterpret_cast<Base const&>(rhs)); }
 
diff --git a/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl b/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl
new file mode 100644
index 0000000000..36bcd944c6
--- /dev/null
+++ b/include/nbl/builtin/hlsl/cpp_compat/unroll.hlsl
@@ -0,0 +1,12 @@
+#ifndef _NBL_BUILTIN_HLSL_CPP_COMPAT_UNROLL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_CPP_COMPAT_UNROLL_INCLUDED_
+
+#ifdef __HLSL_VERSION // compiling as HLSL: map the macros onto the `[unroll]` loop attribute
+#define NBL_UNROLL [unroll]
+#define NBL_UNROLL_LIMITED(LIMIT) [unroll(LIMIT)]
+#else // compiling as C++: both macros expand to nothing
+#define NBL_UNROLL // TODO: no C++ unroll hint is emitted yet
+#define NBL_UNROLL_LIMITED(LIMIT)
+#endif
+
+#endif
diff --git a/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl b/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl
index 354937427a..a27b718bc8 100644
--- a/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl
+++ b/include/nbl/builtin/hlsl/cpp_compat/vector.hlsl
@@ -2,7 +2,7 @@
 #define _NBL_BUILTIN_HLSL_CPP_COMPAT_VECTOR_INCLUDED_
 
 // stuff for C++
-#ifndef __HLSL_VERSION 
+#ifndef __HLSL_VERSION
 #include <stdint.h>
 #define IMATH_HALF_NO_LOOKUP_TABLE
 #include <half.h>
@@ -91,5 +91,6 @@ struct blake3_hasher::update_impl<hlsl::vector<T,N>,Dummy>
 };
 }
 #endif
+
 }
 #endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl
index 240f7601e4..edf54b5b9f 100644
--- a/include/nbl/builtin/hlsl/emulated/float64_t.hlsl
+++ b/include/nbl/builtin/hlsl/emulated/float64_t.hlsl
@@ -2,6 +2,7 @@
 #define _NBL_BUILTIN_HLSL_EMULATED_FLOAT64_T_HLSL_INCLUDED_
 
 #include <nbl/builtin/hlsl/emulated/float64_t_impl.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
 
 namespace nbl
 {
@@ -96,8 +97,8 @@ namespace hlsl
             {
                 if(!FastMath)
                 {
-                    const bool isRhsInf = hlsl::isinf(rhs.data);
-                    if (hlsl::isinf(data))
+                    const bool isRhsInf = tgmath_impl::isinf_uint_impl(rhs.data);
+                    if (tgmath_impl::isinf_uint_impl(data))
                     {
                         if (isRhsInf && ((data ^ rhs.data) & ieee754::traits<float64_t>::signMask))
                             return bit_cast<this_t>(ieee754::traits<float64_t>::quietNaN | ieee754::traits<float64_t>::signMask);
@@ -115,7 +116,7 @@ namespace hlsl
                  
                 if(!FastMath)
                 {
-                    if (hlsl::isinf(data))
+                    if (tgmath_impl::isinf_uint_impl(data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::inf | ieee754::extractSignPreserveBitPattern(max(data, rhs.data)));
                 }
 
@@ -225,9 +226,9 @@ namespace hlsl
                 uint64_t sign = (data ^ rhs.data) & ieee754::traits<float64_t>::signMask;
                 if (!FastMath)
                 {
-                    if (hlsl::isnan(data) || hlsl::isnan(rhs.data))
+                    if (tgmath_impl::isnan_uint_impl(data) || tgmath_impl::isnan_uint_impl(rhs.data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::quietNaN | sign);
-                    if (hlsl::isinf(data) || hlsl::isinf(rhs.data))
+                    if (tgmath_impl::isinf_uint_impl(data) || tgmath_impl::isinf_uint_impl(rhs.data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::inf | sign);
                     if (emulated_float64_t_impl::isZero(data) || emulated_float64_t_impl::isZero(rhs.data))
                         return bit_cast<this_t>(sign);
@@ -288,7 +289,7 @@ namespace hlsl
 
                 if(!FastMath)
                 {
-                    if (hlsl::isnan<uint64_t>(data) || hlsl::isnan<uint64_t>(rhs.data))
+                    if (tgmath_impl::isnan_uint_impl<uint64_t>(data) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs.data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::quietNaN);
                     if (emulated_float64_t_impl::areBothZero(data, rhs.data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::quietNaN | sign);
@@ -296,9 +297,9 @@ namespace hlsl
                         return bit_cast<this_t>(ieee754::traits<float64_t>::inf | sign);
                     if (emulated_float64_t_impl::areBothInfinity(data, rhs.data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::quietNaN | ieee754::traits<float64_t>::signMask);
-                    if (hlsl::isinf(data))
+                    if (tgmath_impl::isinf_uint_impl(data))
                         return bit_cast<this_t>(ieee754::traits<float64_t>::inf | sign);
-                    if (hlsl::isinf(rhs.data))
+                    if (tgmath_impl::isinf_uint_impl(rhs.data))
                         return bit_cast<this_t>(sign);
                 }
 
@@ -345,7 +346,7 @@ namespace hlsl
         {
             if (!FastMath)
             {
-                if (hlsl::isnan<uint64_t>(data) || hlsl::isnan<uint64_t>(rhs.data))
+                if (tgmath_impl::isnan_uint_impl<uint64_t>(data) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs.data))
                     return false;
                 if (emulated_float64_t_impl::areBothZero(data, rhs.data))
                     return true;
@@ -355,7 +356,7 @@ namespace hlsl
         }
         bool operator!=(this_t rhs) NBL_CONST_MEMBER_FUNC
         {
-            if (!FastMath && (hlsl::isnan<uint64_t>(data) || hlsl::isnan<uint64_t>(rhs.data)))
+            if (!FastMath && (tgmath_impl::isnan_uint_impl<uint64_t>(data) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs.data)))
                 return false;
 
             return !(bit_cast<this_t>(data) == rhs);
@@ -370,14 +371,14 @@ namespace hlsl
         }
         bool operator<=(this_t rhs) NBL_CONST_MEMBER_FUNC 
         { 
-            if (!FastMath && (hlsl::isnan<uint64_t>(data) || hlsl::isnan<uint64_t>(rhs.data)))
+            if (!FastMath && (tgmath_impl::isnan_uint_impl<uint64_t>(data) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs.data)))
                 return false;
 
             return !(bit_cast<this_t>(data) > bit_cast<this_t>(rhs.data));
         }
         bool operator>=(this_t rhs)
         {
-            if (!FastMath && (hlsl::isnan<uint64_t>(data) || hlsl::isnan<uint64_t>(rhs.data)))
+            if (!FastMath && (tgmath_impl::isnan_uint_impl<uint64_t>(data) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs.data)))
                 return false;
 
             return !(bit_cast<this_t>(data) < bit_cast<this_t>(rhs.data));
@@ -497,7 +498,7 @@ struct static_cast_helper<To,emulated_float64_t<FastMath,FlushDenormToZero>,void
                     return bit_cast<To>(ieee754::traits<ToAsFloat>::inf);
                 if (exponent < ieee754::traits<ToAsFloat>::exponentMin)
                     return bit_cast<To>(-ieee754::traits<ToAsFloat>::inf);
-                if (hlsl::isnan(v.data))
+                if (tgmath_impl::isnan_uint_impl(v.data))
                     return bit_cast<To>(ieee754::traits<ToAsFloat>::quietNaN);
             }
 
@@ -574,12 +575,32 @@ DEFINE_BIT_CAST_SPEC(emulated_float64_t<false, true>);
 
 namespace ieee754
 {
+namespace impl // bitCastToUintType for the emulated doubles: the bit pattern is simply the `data` member
+{
+template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t bitCastToUintType(emulated_float64_t<true, true> x) { return x.data; }
+template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t bitCastToUintType(emulated_float64_t<false, false> x) { return x.data; }
+template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t bitCastToUintType(emulated_float64_t<true, false> x) { return x.data; }
+template<> NBL_CONSTEXPR_INLINE_FUNC uint64_t bitCastToUintType(emulated_float64_t<false, true> x) { return x.data; }
+} // one explicit specialization per <FastMath, FlushDenormToZero> combination
+
 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t<true, true>);
 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t<false, false>);
 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t<true, false>);
 IMPLEMENT_IEEE754_FUNC_SPEC_FOR_EMULATED_F64_TYPE(emulated_float64_t<false, true>);
 }
 
+namespace concepts
+{
+namespace impl
+{
+template<bool FastMath, bool FlushDenormToZero> // partial specialization covering every emulated_float64_t variant
+struct is_emulating_floating_point_scalar<emulated_float64_t<FastMath, FlushDenormToZero> >
+{
+    NBL_CONSTEXPR_STATIC_INLINE bool value = true; // marks the type as an emulated floating point scalar for the concepts built on this trait
+};
+}
+}
+
 }
 }
 
diff --git a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl
index 9703858605..c987079bfc 100644
--- a/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl
+++ b/include/nbl/builtin/hlsl/emulated/float64_t_impl.hlsl
@@ -5,6 +5,7 @@
 #include <nbl/builtin/hlsl/ieee754.hlsl>
 #include <nbl/builtin/hlsl/algorithm.hlsl>
 #include <nbl/builtin/hlsl/glsl_compat/core.hlsl>
+#include <nbl/builtin/hlsl/tgmath.hlsl>
 
 // TODO: when it will be possible, use this unions wherever they fit:
 /*
@@ -170,7 +171,7 @@ NBL_CONSTEXPR_INLINE_FUNC bool operatorLessAndGreaterCommonImplementation(uint64
 {
     if (!FastMath)
     {
-        if (hlsl::isnan<uint64_t>(lhs) || hlsl::isnan<uint64_t>(rhs))
+        if (tgmath_impl::isnan_uint_impl<uint64_t>(lhs) || tgmath_impl::isnan_uint_impl<uint64_t>(rhs))
             return false;
         if (emulated_float64_t_impl::areBothZero(lhs, rhs))
             return false;
diff --git a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl
index a199a162fd..5a0197f93a 100644
--- a/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl
+++ b/include/nbl/builtin/hlsl/emulated/matrix_t.hlsl
@@ -2,6 +2,7 @@
 #define _NBL_BUILTIN_HLSL_EMULATED_MATRIX_T_HLSL_INCLUDED_
 
 #include <nbl/builtin/hlsl/portable/float64_t.hlsl>
+#include <nbl/builtin/hlsl/emulated/vector_t.hlsl>
 #include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
 
 namespace nbl
@@ -54,17 +55,23 @@ template<typename T> \
 struct matrix_traits<emulated_matrix<T, ROW_COUNT, COLUMN_COUNT> > \
 { \
     using scalar_type = T; \
-    using row_type = vector<T, COLUMN_COUNT>; \
+    using row_type = emulated_vector_t<T, COLUMN_COUNT>; \
     using transposed_type = emulated_matrix<T, COLUMN_COUNT, ROW_COUNT>; \
     NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; \
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; \
     NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; \
+    NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = true; \
 };
 
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 2)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 3)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 4)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 2)
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4)
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 4)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 2)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 3)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4)
 
 #undef DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION
 
@@ -75,25 +82,52 @@ struct transpose_helper<emulated_matrix<T, N, M> >
 {
     using transposed_t = typename matrix_traits<emulated_matrix<T, N, M> >::transposed_type;
 
-	static transposed_t transpose(NBL_CONST_REF_ARG(emulated_matrix<T, N, M>) m)
+	static transposed_t __call(NBL_CONST_REF_ARG(emulated_matrix<T, N, M>) m)
 	{
         return m.getTransposed();
 	}
 };
 
+template<typename ComponentT, int N, int M, int O> // NOTE(review): dimensions are `int` here but `uint16_t` in the matrix*vector specialization - confirm which matches emulated_matrix
+struct mul_helper<emulated_matrix<ComponentT, N, M>, emulated_matrix<ComponentT, M, O> > // matrix product of an N-row, M-column LHS with an M-row, O-column RHS
+{
+    using LhsT = emulated_matrix<ComponentT, N, M>;
+    using RhsT = emulated_matrix<ComponentT, M, O>;
+    using return_t = emulated_matrix<ComponentT, N, O>;
+
+    static inline return_t __call(LhsT lhs, RhsT rhs) // returns lhs*rhs, built one output row at a time
+    {
+        // the row count must come from the matrix traits: the `Dimension` of a row vector is the column count and only equals it for square results
+        const uint32_t outputRowCount = matrix_traits<return_t>::RowCount;
+
+        nbl::hlsl::array_get<typename matrix_traits<LhsT>::row_type, typename vector_traits<typename matrix_traits<LhsT>::row_type>::scalar_type> getter;
+
+        return_t output;
+        const uint32_t RHSRowCount = matrix_traits<RhsT>::RowCount;
+        for (uint32_t rO = 0; rO < outputRowCount; ++rO)
+        {
+            output.rows[rO] = rhs.rows[0] * getter(lhs.rows[rO], 0); // start the sum with the first RHS row scaled by the first LHS component
+            for (uint32_t rI = 1; rI < RHSRowCount; ++rI) // it's also the LHS column count
+                output.rows[rO] = output.rows[rO] + rhs.rows[rI] * getter(lhs.rows[rO], rI); // accumulate RHS rows weighted by the LHS row components
+        }
+
+        return output;
+    }
+};
+
 template<typename ComponentT, uint16_t RowCount, uint16_t ColumnCount>
 struct mul_helper<emulated_matrix<ComponentT, RowCount, ColumnCount>, emulated_vector_t<ComponentT, ColumnCount> >
 {
     using MatT = emulated_matrix<ComponentT, RowCount, ColumnCount>;
     using VecT = emulated_vector_t<ComponentT, ColumnCount>;
-    using OutVecT = emulated_vector_t<ComponentT, RowCount>;
+    using return_t = emulated_vector_t<ComponentT, RowCount>;
 
-    static inline OutVecT multiply(MatT mat, VecT vec)
+    static inline return_t __call(MatT mat, VecT vec)
     {
         nbl::hlsl::array_get<VecT, typename vector_traits<VecT>::scalar_type> getter;
         nbl::hlsl::array_set<VecT, typename vector_traits<VecT>::scalar_type> setter;
 
-        OutVecT output;
+        return_t output;
         for (int i = 0; i < RowCount; ++i)
             setter(output, i, nbl::hlsl::dot<VecT>(mat.rows[i], vec));
 
@@ -104,4 +138,4 @@ struct mul_helper<emulated_matrix<ComponentT, RowCount, ColumnCount>, emulated_v
 
 }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl
index 68fe23f355..0053008aa4 100644
--- a/include/nbl/builtin/hlsl/emulated/vector_t.hlsl
+++ b/include/nbl/builtin/hlsl/emulated/vector_t.hlsl
@@ -412,6 +412,7 @@ struct vector_traits<emulated_vector_t<T, DIMENSION> >\
 {\
     using scalar_type = T;\
     NBL_CONSTEXPR_STATIC_INLINE uint32_t Dimension = DIMENSION;\
+    NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\
 };\
 
 DEFINE_SCALAR_OF_SPECIALIZATION(2)
@@ -471,9 +472,12 @@ struct static_cast_helper<vector<ToComponentType, N>, emulated_vector_t<FromComp
 
     static inline OutputVecType cast(InputVecType vec)
     {
+        array_get<InputVecType, FromComponentType> getter;
+        array_set<OutputVecType, ToComponentType> setter;
+        
         OutputVecType output;
-        output.x = _static_cast<ToComponentType>(vec.x);
-        output.y = _static_cast<ToComponentType>(vec.y);
+        for (int i = 0; i < N; ++i)
+            setter(output, i, _static_cast<ToComponentType>(getter(vec, i)));
 
         return output;
     }
diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl
index cbbc0b8abd..08992b2fc0 100644
--- a/include/nbl/builtin/hlsl/ieee754.hlsl
+++ b/include/nbl/builtin/hlsl/ieee754.hlsl
@@ -2,6 +2,7 @@
 #define _NBL_BUILTIN_HLSL_IEE754_HLSL_INCLUDED_
 
 #include <nbl/builtin/hlsl/ieee754/impl.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
 
 namespace nbl
 {
@@ -130,6 +131,33 @@ NBL_CONSTEXPR_INLINE_FUNC typename unsigned_integer_of_size<sizeof(T)>::type ext
 	return ieee754::impl::bitCastToUintType(x) & traits<AsFloat>::signMask;
 }
 
+template <typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<FloatingPoint>)
+NBL_CONSTEXPR_INLINE_FUNC FloatingPoint copySign(FloatingPoint to, FloatingPoint from) // all non-sign bits of `to` combined with the sign bit of `from`
+{
+	using AsFloat = typename float_of_size<sizeof(FloatingPoint)>::type; // traits are looked up through the equally sized float type, as flipSign does
+	using AsUint = typename unsigned_integer_of_size<sizeof(FloatingPoint)>::type;
+
+	const AsUint toAsUint = ieee754::impl::bitCastToUintType(to) & (~ieee754::traits<AsFloat>::signMask); // clear the sign bit of `to`
+
+	return bit_cast<FloatingPoint>(toAsUint | extractSignPreserveBitPattern(from)); // OR in the sign bit extracted from `from`
+}
+
+template <typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<FloatingPoint>)
+NBL_CONSTEXPR_INLINE_FUNC FloatingPoint flipSign(FloatingPoint val) // returns `val` with its sign bit inverted
+{
+	using AsFloat = typename float_of_size<sizeof(FloatingPoint)>::type; // equally sized float type, used only for the traits lookup
+	using AsUint = typename unsigned_integer_of_size<sizeof(FloatingPoint)>::type;
+	const AsUint asUint = ieee754::impl::bitCastToUintType(val);
+
+	return bit_cast<FloatingPoint>(asUint ^ ieee754::traits<AsFloat>::signMask); // XOR with the sign mask toggles only the sign bit
+}
+
+template <typename FloatingPoint NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<FloatingPoint>)
+NBL_CONSTEXPR_INLINE_FUNC FloatingPoint flipSign(FloatingPoint val, bool flip) // conditional variant of flipSign(val)
+{
+	return flip ? flipSign(val) : val; // `val` is returned untouched when `flip` is false
+}
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl
index c34c25602f..8ec658ef03 100644
--- a/include/nbl/builtin/hlsl/math/equations/quartic.hlsl
+++ b/include/nbl/builtin/hlsl/math/equations/quartic.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/math/equations/quadratic.hlsl>
 #include <nbl/builtin/hlsl/math/equations/cubic.hlsl>
+#include <nbl/builtin/hlsl/tgmath.hlsl>
 
 // TODO: Later include from correct hlsl header
 #ifndef nbl_hlsl_FLT_EPSILON
@@ -88,7 +89,7 @@ namespace equations
                 float_t3 cubic = Cubic<float_t>::construct(1, -1.0 / 2 * p, -r, 1.0 / 2 * r * p - 1.0 / 8 * q * q).computeRoots();
                 /* ... and take the one real solution ... */
                 for (uint32_t i = 0; i < 3; i ++)
-                    if (!isnan(cubic[i]))
+                    if (!hlsl::isnan<float_t>(cubic[i]))
                     {
                         z = cubic[i];
                         break;
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
new file mode 100644
index 0000000000..a36c2027f8
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -0,0 +1,453 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_MATH_FUNCTIONS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_FUNCTIONS_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/numbers.hlsl"
+#include "nbl/builtin/hlsl/spirv_intrinsics/core.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace math
+{
+
+namespace impl
+{
+template<typename T, uint32_t LP, bool Odd=(LP&0x1)>
+struct lp_norm;
+
+// infinity case (selected by LP==0): the largest absolute component of `v`
+template<typename T>
+struct lp_norm<T,0,false>
+{
+    static scalar_type_t<T> __call(const T v)
+    {
+        scalar_type_t<T> retval = abs<scalar_type_t<T> >(v[0]); // components are scalars, so instantiate on the scalar type
+        for (int i = 1; i < extent<T>::value; i++)
+            retval = max<scalar_type_t<T> >(abs<scalar_type_t<T> >(v[i]),retval);
+        return retval;
+    }
+};
+
+// TODO: is this doing what it should be?
+template<typename T>
+struct lp_norm<T,1,false>
+{
+    static scalar_type_t<T> __sum(const T v)
+    {
+        scalar_type_t<T> retval = abs<scalar_type_t<T> >(v[0]); // components are scalars, so instantiate on the scalar type
+        for (int i = 1; i < extent<T>::value; i++)
+            retval += abs<scalar_type_t<T> >(v[i]);
+        return retval;
+    }
+    // for LP==1 no root is taken: the norm is the plain sum of absolute components
+    static scalar_type_t<T> __call(const T v)
+    {
+        return __sum(v);
+    }
+};
+
+template<typename T>
+struct lp_norm<T,2,false>
+{
+    static scalar_type_t<T> __sum(const T v)
+    {
+        return dot<T>(v, v);   // TODO: wait for overloaded dot?
+    }
+    // LP==2: square root of dot(v,v); __sum() already yields a scalar, so the scalar sqrt is used
+    static scalar_type_t<T> __call(const T v)
+    {
+        return sqrt<scalar_type_t<T> >(__sum(v));
+    }
+};
+
+// TODO: even/odd cases
+}
+
+template<typename T, uint32_t LP NBL_FUNC_REQUIRES(LP>0)
+scalar_type_t<T> lpNormPreroot(NBL_CONST_REF_ARG(T) v) // forwards to impl::lp_norm<T,LP>::__sum; constrained to LP>0
+{
+    return impl::lp_norm<T,LP>::__sum(v);
+}
+
+template<typename T, uint32_t LP>
+scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v) // forwards to impl::lp_norm<T,LP>::__call
+{
+    return impl::lp_norm<T,LP>::__call(v);
+}
+
+
+template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T, 3> reflect(vector<T, 3> I, vector<T, 3> N, T NdotI)
+{
+    return N * 2.0f * NdotI - I; // 2*NdotI*N - I, with the dot product supplied by the caller
+}
+
+template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T, 3> reflect(vector<T, 3> I, vector<T, 3> N)
+{
+    const T NdotI = dot<vector<T, 3> >(N, I); // dot<> is instantiated on the operand (vector) type, not the scalar type
+    return reflect<T>(I, N, NdotI);
+}
+
+
+namespace impl
+{
+template<typename T>
+struct orientedEtas;
+// scalar eta: when NdotI is negative the outputs are swapped (orientedEta=1/eta, rcpOrientedEta=eta)
+template<>
+struct orientedEtas<float>
+{
+    static bool __call(NBL_REF_ARG(float) orientedEta, NBL_REF_ARG(float) rcpOrientedEta, float NdotI, float eta)
+    {
+        const bool backside = NdotI < 0.0;
+        const float rcpEta = 1.0 / eta;
+        orientedEta = backside ? rcpEta : eta;
+        rcpOrientedEta = backside ? eta : rcpEta;
+        return backside; // true when NdotI is negative
+    }
+};
+// three-component eta: same selection as the scalar case, still driven by a single scalar NdotI
+template<>
+struct orientedEtas<float32_t3>
+{
+    static bool __call(NBL_REF_ARG(float32_t3) orientedEta, NBL_REF_ARG(float32_t3) rcpOrientedEta, float NdotI, float32_t3 eta)
+    {
+        const bool backside = NdotI < 0.0;
+        const float32_t3 rcpEta = (float32_t3)1.0 / eta;
+        orientedEta = backside ? rcpEta:eta;
+        rcpOrientedEta = backside ? eta:rcpEta;
+        return backside; // true when NdotI is negative
+    }
+};
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T> || is_vector_v<T>)
+bool getOrientedEtas(NBL_REF_ARG(T) orientedEta, NBL_REF_ARG(T) rcpOrientedEta, scalar_type_t<T> NdotI, T eta)
+{
+    return impl::orientedEtas<T>::__call(orientedEta, rcpOrientedEta, NdotI, eta); // only the float and float32_t3 specializations are visible in this header
+}
+
+
+namespace impl
+{
+template<typename T>
+struct refract
+{
+    using this_t = refract;
+    using vector_type = vector<T,3>;
+    // stores every term exactly as given; nothing is derived
+    static this_t create(vector_type I, vector_type N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
+    {
+        this_t retval;
+        retval.I = I;
+        retval.N = N;
+        retval.backside = backside;
+        retval.NdotI = NdotI;
+        retval.NdotI2 = NdotI2;
+        retval.rcpOrientedEta = rcpOrientedEta;
+        retval.rcpOrientedEta2 = rcpOrientedEta2;
+        return retval;
+    }
+    // derives backside and rcpOrientedEta from `eta` via getOrientedEtas; NdotI is supplied by the caller
+    static this_t create(vector_type I, vector_type N, T NdotI, T eta)
+    {
+        this_t retval;
+        retval.I = I;
+        retval.N = N;
+        T orientedEta; // written by getOrientedEtas but not stored
+        retval.backside = getOrientedEtas<T>(orientedEta, retval.rcpOrientedEta, NdotI, eta);
+        retval.NdotI = NdotI;
+        retval.NdotI2 = NdotI * NdotI;
+        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
+        return retval;
+    }
+    // same as the overload above, except that NdotI is computed here as dot(N,I)
+    static this_t create(vector_type I, vector_type N, T eta)
+    {
+        this_t retval;
+        retval.I = I;
+        retval.N = N;
+        retval.NdotI = dot<vector_type>(N, I); // dot<> is instantiated on the operand (vector) type, not the scalar type
+        T orientedEta; // written by getOrientedEtas but not stored
+        retval.backside = getOrientedEtas<T>(orientedEta, retval.rcpOrientedEta, retval.NdotI, eta);
+        retval.NdotI2 = retval.NdotI * retval.NdotI;
+        retval.rcpOrientedEta2 = retval.rcpOrientedEta * retval.rcpOrientedEta;
+        return retval;
+    }
+    // sqrt(rcpOrientedEta2*NdotI2 + 1 - rcpOrientedEta2), positive when `backside` is set and negative otherwise
+    T computeNdotT()
+    {
+        T NdotT2 = rcpOrientedEta2 * NdotI2 + 1.0 - rcpOrientedEta2;
+        T absNdotT = sqrt<T>(NdotT2);
+        return backside ? absNdotT : -(absNdotT);
+    }
+
+    vector_type doRefract()
+    {
+        return N * (NdotI * rcpOrientedEta + computeNdotT()) - rcpOrientedEta * I;
+    }
+
+    static vector_type doReflectRefract(bool _refract, vector_type _I, vector_type _N, T _NdotI, T _NdotTorR, T _rcpOrientedEta)
+    {
+        return _N * (_NdotI * (_refract ? _rcpOrientedEta : 1.0f) + _NdotTorR) - _I * (_refract ? _rcpOrientedEta : 1.0f); // with _refract==false the eta factor is 1
+    }
+
+    vector_type doReflectRefract(bool r)
+    {
+        const T NdotTorR = r ? computeNdotT() : NdotI; // refraction uses N.T, reflection reuses N.I
+        return doReflectRefract(r, I, N, NdotI, NdotTorR, rcpOrientedEta);
+    }
+
+    vector_type I;
+    vector_type N;
+    bool backside;
+    T NdotI;
+    T NdotI2;
+    T rcpOrientedEta;
+    T rcpOrientedEta2;
+};
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> refract(vector<T,3> I, vector<T,3> N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
+{
+    impl::refract<T> r = impl::refract<T>::create(I, N, backside, NdotI, NdotI2, rcpOrientedEta, rcpOrientedEta2); // all terms precomputed by the caller
+    return r.doRefract();
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> refract(vector<T,3> I, vector<T,3> N, T NdotI, T eta)
+{
+    impl::refract<T> r = impl::refract<T>::create(I, N, NdotI, eta); // orientation terms derived from eta and the given NdotI
+    return r.doRefract();
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> refract(vector<T,3> I, vector<T,3> N, T eta)
+{
+    impl::refract<T> r = impl::refract<T>::create(I, N, eta); // NdotI is computed inside create()
+    return r.doRefract();
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+T reflectRefract_computeNdotT(bool backside, T NdotI2, T rcpOrientedEta2)
+{
+    impl::refract<T> r; // only the three fields read by computeNdotT() are filled in; the rest stay unset
+    r.NdotI2 = NdotI2;
+    r.rcpOrientedEta2 = rcpOrientedEta2;
+    r.backside = backside;
+    return r.computeNdotT();
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> reflectRefract_impl(bool _refract, vector<T,3> _I, vector<T,3> _N, T _NdotI, T _NdotTorR, T _rcpOrientedEta)
+{
+    return impl::refract<T>::doReflectRefract(_refract, _I, _N, _NdotI, _NdotTorR, _rcpOrientedEta); // forwards unchanged to the static helper
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> reflectRefract(bool _refract, vector<T,3> I, vector<T,3> N, bool backside, T NdotI, T NdotI2, T rcpOrientedEta, T rcpOrientedEta2)
+{
+    impl::refract<T> r = impl::refract<T>::create(I, N, backside, NdotI, NdotI2, rcpOrientedEta, rcpOrientedEta2); // all terms precomputed by the caller
+    return r.doReflectRefract(_refract); // `_refract` selects refraction (true) or reflection (false)
+}
+
+template<typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+vector<T,3> reflectRefract(bool _refract, vector<T,3> I, vector<T,3> N, T NdotI, T eta)
+{
+    impl::refract<T> r = impl::refract<T>::create(I, N, NdotI, eta); // orientation terms derived from eta and the given NdotI
+    return r.doReflectRefract(_refract);
+}
+
+
+// sine and cosine in one call, with |sin| derived from cos; valid only for `theta` in [-PI,PI]
+template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
+{
+    c = cos<T>(theta);
+    const T absSin = sqrt<T>(1.0-c*c);
+    s = (theta < 0.0) ? -absSin : absSin; // TODO: test with XOR
+}
+
+template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T>)
+matrix<T, 2, 3> frisvad(vector<T, 3> n)
+{
+	if (n.z < -0.9999999) return matrix<T, 2, 3>(vector<T, 3>(0.0,-1.0,0.0), vector<T, 3>(-1.0,0.0,0.0)); // fixed pair of rows when n.z is below the threshold
+	const T rcpOnePlusZ = 1.0 / (1.0 + n.z);
+	const T cross = -n.x * n.y * rcpOnePlusZ;
+	return matrix<T, 2, 3>(vector<T, 3>(1.0-n.x*n.x*rcpOnePlusZ, cross, -n.x), vector<T, 3>(cross, 1.0-n.y*n.y*rcpOnePlusZ, -n.y));
+}
+
+inline bool partitionRandVariable(float leftProb, NBL_REF_ARG(float) xi, NBL_REF_ARG(float) rcpChoiceProb)
+{
+#ifdef __HLSL_VERSION
+    NBL_CONSTEXPR float NEXT_ULP_AFTER_UNITY = asfloat(0x3f800001u);
+#else
+    NBL_CONSTEXPR float32_t NEXT_ULP_AFTER_UNITY = bit_cast<float32_t>(0x3f800001u);
+#endif
+    const bool pickRight = xi >= leftProb * NEXT_ULP_AFTER_UNITY;
+    // when the right side is picked, `xi` is shifted down by leftProb before being rescaled
+    // This is all 100% correct taking into account the above NEXT_ULP_AFTER_UNITY
+    xi -= pickRight ? leftProb : 0.0;
+    // reciprocal probability of the chosen side; it is returned to the caller and also rescales `xi`
+    rcpChoiceProb = 1.0 / (pickRight ? (1.0 - leftProb) : leftProb);
+    xi *= rcpChoiceProb;
+
+    return pickRight;
+}
+
+
+// TODO: make it work in C++, ignoring problem for now
+#ifdef __HLSL_VERSION
+// @ return max(abs(x),limit) if cond==true, max(x,limit) otherwise
+template <typename T NBL_FUNC_REQUIRES(is_scalar_v<T> || is_vector_v<T>)
+T conditionalAbsOrMax(bool cond, T x, T limit);
+
+template <>
+float conditionalAbsOrMax<float>(bool cond, float x, float limit)
+{
+    const float condAbs = asfloat(asuint(x) & uint(cond ? 0x7fFFffFFu : 0xffFFffFFu)); // 0x7fffffff clears the sign bit, 0xffffffff keeps x as is
+    return max(condAbs,limit);
+}
+
+template <>
+float32_t2 conditionalAbsOrMax<float32_t2>(bool cond, float32_t2 x, float32_t2 limit)
+{
+    const float32_t2 condAbs = asfloat(asuint(x) & select(cond, (uint32_t2)0x7fFFffFFu, (uint32_t2)0xffFFffFFu)); // same mask applied to every component
+    return max(condAbs,limit);
+}
+
+template <>
+float32_t3 conditionalAbsOrMax<float32_t3>(bool cond, float32_t3 x, float32_t3 limit)
+{
+    const float32_t3 condAbs = asfloat(asuint(x) & select(cond, (uint32_t3)0x7fFFffFFu, (uint32_t3)0xffFFffFFu)); // same mask applied to every component
+    return max(condAbs,limit);
+}
+
+template <>
+float32_t4 conditionalAbsOrMax<float32_t4>(bool cond, float32_t4 x, float32_t4 limit)
+{
+    const float32_t4 condAbs = asfloat(asuint(x) & select(cond, (uint32_t4)0x7fFFffFFu, (uint32_t4)0xffFFffFFu)); // same mask applied to every component
+    return max(condAbs,limit);
+}
+#endif
+
+namespace impl
+{
+struct trigonometry
+{
+    using this_t = trigonometry;
+    // all six scratch slots zeroed; used by the free functions that only need temporaries
+    static this_t create()
+    {
+        this_t retval;
+        retval.tmp0 = 0;
+        retval.tmp1 = 0;
+        retval.tmp2 = 0;
+        retval.tmp3 = 0;
+        retval.tmp4 = 0;
+        retval.tmp5 = 0;
+        return retval;
+    }
+    // slot layout expected by getArccosSumofABC_minus_PI(): tmp0..tmp2 = cosines, tmp3..tmp5 = sines
+    static this_t create(float cosA, float cosB, float cosC, float sinA, float sinB, float sinC)
+    {
+        this_t retval;
+        retval.tmp0 = cosA;
+        retval.tmp1 = cosB;
+        retval.tmp2 = cosC;
+        retval.tmp3 = sinA;
+        retval.tmp4 = sinB;
+        retval.tmp5 = sinC;
+        return retval;
+    }
+    // reads the slots as filled by the six-argument create()
+    float getArccosSumofABC_minus_PI()
+    {
+        const bool AltminusB = tmp0 < (-tmp1);
+        const float cosSumAB = tmp0 * tmp1 - tmp3 * tmp4; // cosA*cosB - sinA*sinB
+        const bool ABltminusC = cosSumAB < (-tmp2);
+        const bool ABltC = cosSumAB < tmp2;
+        // apply triple angle formula
+        const float absArccosSumABC = acos<float>(clamp<float>(cosSumAB * tmp2 - (tmp0 * tmp4 + tmp3 * tmp1) * tmp5, -1.f, 1.f));
+        return ((AltminusB ? ABltC : ABltminusC) ? (-absArccosSumABC) : absArccosSumABC) + (AltminusB | ABltminusC ? numbers::pi<float> : (-numbers::pi<float>));
+    }
+    // out0 receives a combined cosine and out1 an angle bias; the callers in this header evaluate acos(out0) + out1
+    static void combineCosForSumOfAcos(float cosA, float cosB, float biasA, float biasB, NBL_REF_ARG(float) out0, NBL_REF_ARG(float) out1)
+    {
+        const float bias = biasA + biasB;
+        const float a = cosA;
+        const float b = cosB;
+        const bool reverse = abs<float>(min<float>(a, b)) > max<float>(a, b);
+        const float c = a * b - sqrt<float>((1.0f - a * a) * (1.0f - b * b));
+
+        if (reverse)
+        {
+            out0 = -c; // negated cosine paired with an extra PI in the bias
+            out1 = bias + numbers::pi<float>;
+        }
+        else
+        {
+            out0 = c;
+            out1 = bias;
+        }
+    }
+
+    float tmp0; // cosA after create(cosA,...); otherwise scratch
+    float tmp1; // cosB after create(cosA,...); otherwise scratch
+    float tmp2; // cosC after create(cosA,...); otherwise scratch
+    float tmp3; // sinA after create(cosA,...); otherwise scratch
+    float tmp4; // sinB after create(cosA,...); otherwise scratch
+    float tmp5; // sinC after create(cosA,...); otherwise scratch
+};
+}
+
+inline float getArccosSumofABC_minus_PI(float cosA, float cosB, float cosC, float sinA, float sinB, float sinC)
+{
+    impl::trigonometry trig = impl::trigonometry::create(cosA, cosB, cosC, sinA, sinB, sinC); // packs the six terms into the helper's slots
+    return trig.getArccosSumofABC_minus_PI();
+}
+
+inline void combineCosForSumOfAcos(float cosA, float cosB, float biasA, float biasB, NBL_REF_ARG(float) out0, NBL_REF_ARG(float) out1)
+{
+    impl::trigonometry trig = impl::trigonometry::create(); // tmp0/tmp1 serve as scratch for the two results
+    impl::trigonometry::combineCosForSumOfAcos(cosA, cosB, biasA, biasB, trig.tmp0, trig.tmp1);
+    out0 = trig.tmp0;
+    out1 = trig.tmp1;
+}
+
+// returns acos(a) + acos(b)
+inline float getSumofArccosAB(float cosA, float cosB)
+{
+    impl::trigonometry trig = impl::trigonometry::create();
+    impl::trigonometry::combineCosForSumOfAcos(cosA, cosB, 0.0f, 0.0f, trig.tmp0, trig.tmp1); // tmp0 = combined cosine, tmp1 = bias
+    return acos<float>(trig.tmp0) + trig.tmp1;
+}
+
+// returns acos(a) + acos(b) + acos(c) + acos(d)
+inline float getSumofArccosABCD(float cosA, float cosB, float cosC, float cosD)
+{
+    impl::trigonometry trig = impl::trigonometry::create();
+    impl::trigonometry::combineCosForSumOfAcos(cosA, cosB, 0.0f, 0.0f, trig.tmp0, trig.tmp1); // pair (A,B) -> cosine tmp0, bias tmp1
+    impl::trigonometry::combineCosForSumOfAcos(cosC, cosD, 0.0f, 0.0f, trig.tmp2, trig.tmp3); // pair (C,D) -> cosine tmp2, bias tmp3
+    impl::trigonometry::combineCosForSumOfAcos(trig.tmp0, trig.tmp2, trig.tmp1, trig.tmp3, trig.tmp4, trig.tmp5); // merge both pairs, carrying their biases
+    return acos<float>(trig.tmp4) + trig.tmp5;
+}
+
+template<typename T, uint16_t M, uint16_t N, uint16_t P NBL_FUNC_REQUIRES(is_scalar_v<T>)
+matrix<T,N,P> applyChainRule(matrix<T,N,M> dFdG, matrix<T,M,P> dGdR)
+{
+    return mul(dFdG,dGdR); // (N x M) * (M x P) yields N x P
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl
new file mode 100644
index 0000000000..bc0286e778
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion.hlsl
@@ -0,0 +1,101 @@
+// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
+// For conditions of distribution and use, see copyright notice in nabla.h
+// See the original file in irrlicht source for authors
+
+#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNION_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_QUATERNION_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+//! Quaternion class for representing rotations.
+/** It provides cheap combinations and avoids gimbal locks.
+Also useful for interpolations. */
+
+template<typename float_t>
+struct quaternion
+{
+	// i*data[0] + j*data[1] + k*data[2] + data[3]
+	using vec_t = vector<float_t, 4>;
+	vector<float_t, 4> data;
+
+	//! creates identity quaternion
+	static inline quaternion create()
+	{
+		quaternion q;
+		q.data = vector<float_t, 4>(0.0f, 0.0f, 0.0f, 1.0f);
+
+		return q;
+	}
+	//! creates a quaternion from explicit components: x, y, z are the imaginary parts, w is the real part
+	static inline quaternion create(float_t x, float_t y, float_t z, float_t w)
+	{
+		quaternion q;
+		q.data = vector<float_t, 4>(x, y, z, w);
+
+		return q;
+	}
+	//! returns a copy of `other`
+	static inline quaternion create(NBL_CONST_REF_ARG(quaternion) other)
+	{
+		return other;
+	}
+	//! creates a quaternion from three angles; each one is halved and passed to sinf/cosf
+	static inline quaternion create(float_t pitch, float_t yaw, float_t roll)
+	{
+		float angle;
+
+		angle = roll * 0.5f;
+		const float sr = sinf(angle);
+		const float cr = cosf(angle);
+
+		angle = pitch * 0.5f;
+		const float sp = sinf(angle);
+		const float cp = cosf(angle);
+
+		angle = yaw * 0.5f;
+		const float sy = sinf(angle);
+		const float cy = cosf(angle);
+
+		const float cpcy = cp * cy;
+		const float spcy = sp * cy;
+		const float cpsy = cp * sy;
+		const float spsy = sp * sy;
+
+		quaternion<float_t> output;
+		output.data = float32_t4(sr, cr, cr, cr) * float32_t4(cpcy, spcy, cpsy, cpcy) + float32_t4(-cr, sr, -sr, sr) * float32_t4(spsy, cpsy, spcy, spsy);
+
+		return output;
+	}
+
+	// TODO:
+	//explicit quaternion(NBL_CONST_REF_ARG(float32_t3x4) m) {}
+	//! scales all four components by `scalar`
+	inline quaternion operator*(float_t scalar)
+	{
+		quaternion output;
+		output.data = data * scalar;
+		return output;
+	}
+	//! quaternion product `this * other`; arguments follow create(x, y, z, w), so the real part goes last
+	inline quaternion operator*(NBL_CONST_REF_ARG(quaternion) other)
+	{
+		return quaternion::create(
+			data.w * other.data.x + data.x * other.data.w + data.y * other.data.z - data.z * other.data.y,
+			data.w * other.data.y - data.x * other.data.z + data.y * other.data.w + data.z * other.data.x,
+			data.w * other.data.z + data.x * other.data.y - data.y * other.data.x + data.z * other.data.w,
+			data.w * other.data.w - data.x * other.data.x - data.y * other.data.y - data.z * other.data.z
+		);
+	}
+};
+
+} // end namespace hlsl
+} // nbl
+
+#endif
+
diff --git a/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl b/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl
new file mode 100644
index 0000000000..d00d9ce2c4
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/quaternion/quaternion_impl.hlsl
@@ -0,0 +1,25 @@
+// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
+// For conditions of distribution and use, see copyright notice in nabla.h
+// See the original file in irrlicht source for authors
+
+#ifndef _NBL_BUILTIN_HLSL_MATH_QUATERNION_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_QUATERNION_IMPL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+namespace quaternion_impl
+{
+
+}
+
+} // end namespace hlsl
+} // nbl
+
+#endif
+
diff --git a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
index 1da0c7764d..f9c031c8e7 100644
--- a/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
+++ b/include/nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl
@@ -9,8 +9,17 @@ namespace nbl
 namespace hlsl
 {
 
-template<typename MatT>
-struct matrix_traits;
+template<typename T>
+struct matrix_traits // primary template: fallback for types that have no matrix specialization
+{
+    using scalar_type = T;
+    using row_type = void;
+    using transposed_type = void;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = 1;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = 1;
+    NBL_CONSTEXPR_STATIC_INLINE bool Square = false;
+    NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = false; // the matrix<T,R,C> specializations set this to true
+};
 
 // i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCompiler/issues/7007
 #define DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(ROW_COUNT, COLUMN_COUNT) \
@@ -23,12 +32,24 @@ struct matrix_traits<matrix<T, ROW_COUNT, COLUMN_COUNT> > \
     NBL_CONSTEXPR_STATIC_INLINE uint32_t RowCount = ROW_COUNT; \
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ColumnCount = COLUMN_COUNT; \
     NBL_CONSTEXPR_STATIC_INLINE bool Square = RowCount == ColumnCount; \
+    NBL_CONSTEXPR_STATIC_INLINE bool IsMatrix = true; \
 };
 
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 2)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 3)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(1, 4)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 1)
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 2)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 3)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(2, 4)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 1)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 2)
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 3)
-DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4)
 DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(3, 4)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 1)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 2)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 3)
+DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION(4, 4)
 
 #undef DEFINE_MATRIX_TRAITS_TEMPLATE_SPECIALIZATION
 
diff --git a/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl
new file mode 100644
index 0000000000..d1a628ccc0
--- /dev/null
+++ b/include/nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl
@@ -0,0 +1,203 @@
+#ifndef _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATRIX_UTILS_TRANSFORMATION_MATRIX_UTILS_INCLUDED_
+#include <nbl/builtin/hlsl/math/quaternion/quaternion.hlsl>
+// TODO: remove this header when deleting vectorSIMDf.hlsl
+#ifndef __HLSL_VERSION
+#include <nbl/core/math/glslFunctions.h>
+#include "vectorSIMD.h"
+#endif
+#include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
+#include "nbl/builtin/hlsl/cpp_compat/unroll.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+
+template<typename MatT>
+MatT diagonal(float diagonal = 1)
+{
+	// Zero-fills a MatT, then writes `diagonal` into every [k][k] element that exists.
+	MatT output;
+	NBL_UNROLL_LIMITED(4)
+	for (uint32_t i = 0; i < matrix_traits<MatT>::RowCount; ++i)
+		NBL_UNROLL_LIMITED(4)
+		for (uint32_t j = 0; j < matrix_traits<MatT>::ColumnCount; ++j)
+			output[i][j] = 0;
+
+	// the main diagonal has min(RowCount,ColumnCount) entries; bounding by RowCount alone would index past the last column when rows > columns
+	NBL_UNROLL_LIMITED(4)
+	for (uint32_t diag = 0; diag < matrix_traits<MatT>::RowCount && diag < matrix_traits<MatT>::ColumnCount; ++diag)
+		output[diag][diag] = diagonal;
+	return output;
+}
+
+template<typename MatT>
+MatT identity()
+{
+	// TODO
+	// static_assert(MatT::Square);
+	return diagonal<MatT>(1); // delegates to diagonal() with 1 as the diagonal value
+}
+
+// TODO: this is temporary function, delete when removing vectorSIMD
+#ifndef __HLSL_VERSION
+template<typename T>
+inline core::vectorSIMDf transformVector(NBL_CONST_REF_ARG(matrix<T, 4, 4>) mat, NBL_CONST_REF_ARG(core::vectorSIMDf) vec)
+{
+	core::vectorSIMDf output;
+	float32_t4 tmp;
+	for (int i = 0; i < 4; ++i) // rather do that than reinterpret_cast for safety
+		tmp[i] = vec[i]; // copy the INPUT vector; `output` is still uninitialized at this point
+	// each output component is the dot product of the matching matrix row with the input vector
+	for (int i = 0; i < 4; ++i)
+		output[i] = hlsl::dot<float32_t4>(mat[i], tmp);
+
+	return output;
+}
+#endif
+template<typename T>
+inline matrix<T, 4, 4> getMatrix3x4As4x4(NBL_CONST_REF_ARG(matrix<T, 3, 4>) mat)
+{
+	matrix<T, 4, 4> output;
+	for (int i = 0; i < 3; ++i)
+		output[i] = mat[i];
+	output[3] = vector<T, 4>(0.0f, 0.0f, 0.0f, 1.0f); // appended row (0,0,0,1), built in T rather than a fixed float32_t4
+
+	return output;
+}
+
+template<typename T, int N>
+inline matrix<T, 3, 3> getSub3x3(NBL_CONST_REF_ARG(matrix<T, N, 4>) mat)
+{
+	return matrix<T, 3, 3>(mat); // relies on the matrix converting constructor; presumably keeps the upper-left 3x3 block — TODO confirm
+}
+
+template<int N, int M>
+inline matrix<float64_t, N, M> getAs64BitPrecisionMatrix(NBL_CONST_REF_ARG(matrix<float32_t, N, M>) mat)
+{
+	matrix<float64_t, N, M> output;
+	for (int i = 0; i < N; ++i)
+		output[i] = mat[i]; // row-wise float32_t -> float64_t conversion
+
+	return output;
+}
+
+namespace transformation_matrix_utils_impl
+{
+	// This function calculates determinant using the scalar triple product.
+	template<typename T>
+	inline T determinant_helper(NBL_CONST_REF_ARG(matrix<T, 3, 3>) mat, NBL_REF_ARG(vector<T, 3>) r1crossr2)
+	{
+		r1crossr2 = hlsl::cross(mat[1], mat[2]); // also handed back to the caller through the reference argument
+		return hlsl::dot(mat[0], r1crossr2); // row0 . (row1 x row2)
+	}
+}
+
+//! returns the cofactor matrix (i.e. the transpose of the adjugate) of the upper-left 3x3 sub-matrix
+template<typename T, int N, int M>
+inline matrix<T, 3, 3> getSub3x3TransposeCofactors(NBL_CONST_REF_ARG(matrix<T, N, M>) mat)
+{
+	static_assert(N >= 3 && M >= 3);
+
+	matrix<T, 3, 3> output;
+	vector<T, 3> row0 = vector<T, 3>(mat[0]);
+	vector<T, 3> row1 = vector<T, 3>(mat[1]);
+	vector<T, 3> row2 = vector<T, 3>(mat[2]);
+	// each output row is the cross product of the two other input rows, in cyclic order;
+	// every row is assigned exactly once
+	output[0] = hlsl::cross(row1, row2);
+	output[1] = hlsl::cross(row2, row0);
+	output[2] = hlsl::cross(row0, row1);
+
+	return output;
+}
+
+template<typename T, int N>
+inline bool getSub3x3InverseTranspose(NBL_CONST_REF_ARG(matrix<T, N, 4>) matIn, NBL_REF_ARG(matrix<T, 3, 3>) matOut)
+{
+	matrix<T, 3, 3> matIn3x3 = getSub3x3(matIn);
+	vector<T, 3> r1crossr2;
+	T d = transformation_matrix_utils_impl::determinant_helper(matIn3x3, r1crossr2);
+	if (abs(d) <= FLT_MIN) // determinant too small: report failure and leave `matOut` unwritten
+		return false;
+	auto rcp = T(1.0f)/d;
+
+	// matrix of cofactors * 1/det
+	matOut = getSub3x3TransposeCofactors(matIn3x3); // `matOut` is an output, hence a mutable reference argument
+	matOut[0] *= rcp;
+	matOut[1] *= rcp;
+	matOut[2] *= rcp;
+
+	return true;
+}
+
+// TODO: use portable_float when merged
+//! multiplies matrices a and b, 3x4 matrices are treated as 4x4 matrices with 4th row set to (0, 0, 0 ,1)
+template<typename T>
+inline matrix<T, 3, 4> concatenateBFollowedByA(NBL_CONST_REF_ARG(matrix<T, 3, 4>) a, NBL_CONST_REF_ARG(matrix<T, 3, 4>) b)
+{
+	// TODO
+	// static_assert(N == 3 || N == 4);
+	// promote both operands with the caller's scalar type T, multiply as 4x4, then convert back to 3x4
+	const matrix<T, 4, 4> a4x4 = getMatrix3x4As4x4<T>(a);
+	const matrix<T, 4, 4> b4x4 = getMatrix3x4As4x4<T>(b);
+	return matrix<T, 3, 4>(mul(a4x4, b4x4));
+}
+
+template<typename T, int N>
+inline void setScale(NBL_REF_ARG(matrix<T, N, 4>) outMat, NBL_CONST_REF_ARG(vector<T, 3>) scale)
+{
+	// TODO
+	// static_assert(N == 3 || N == 4);
+	// only the first three diagonal elements are written; every other element of outMat is left as is
+	outMat[0][0] = scale[0];
+	outMat[1][1] = scale[1];
+	outMat[2][2] = scale[2];
+}
+
+//! Replaces current rotation and scale by rotation represented by quaternion `quat`, leaves 4th row and 4th column unchanged
+template<typename T, int N>
+inline void setRotation(NBL_REF_ARG(matrix<T, N, 4>) outMat, NBL_CONST_REF_ARG(nbl::hlsl::quaternion<T>) quat)
+{
+	// TODO
+	//static_assert(N == 3 || N == 4);
+
+	outMat[0] = vector<T, 4>(
+		1 - 2 * (quat.data.y * quat.data.y + quat.data.z * quat.data.z),
+		2 * (quat.data.x * quat.data.y - quat.data.z * quat.data.w),
+		2 * (quat.data.x * quat.data.z + quat.data.y * quat.data.w),
+
+		outMat[0][3] // 4th column element is carried over
+	);
+
+	outMat[1] = vector<T, 4>(
+		2 * (quat.data.x * quat.data.y + quat.data.z * quat.data.w),
+		1 - 2 * (quat.data.x * quat.data.x + quat.data.z * quat.data.z),
+		2 * (quat.data.y * quat.data.z - quat.data.x * quat.data.w),
+		outMat[1][3] // 4th column element is carried over
+	);
+
+	outMat[2] = vector<T, 4>(
+		2 * (quat.data.x * quat.data.z - quat.data.y * quat.data.w),
+		2 * (quat.data.y * quat.data.z + quat.data.x * quat.data.w),
+		1 - 2 * (quat.data.x * quat.data.x + quat.data.y * quat.data.y),
+		outMat[2][3] // 4th column element is carried over
+	);
+}
+
+template<typename T, int N>
+inline void setTranslation(NBL_REF_ARG(matrix<T, N, 4>) outMat, NBL_CONST_REF_ARG(vector<T, 3>) translation)
+{
+	// TODO
+	// static_assert(N == 3 || N == 4);
+	// writes the 4th element of the first three rows; the 3x3 part and any 4th row are left untouched
+	outMat[0].w = translation.x;
+	outMat[1].w = translation.y;
+	outMat[2].w = translation.z;
+}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/projection/projection.hlsl b/include/nbl/builtin/hlsl/projection/projection.hlsl
new file mode 100644
index 0000000000..22d2872fde
--- /dev/null
+++ b/include/nbl/builtin/hlsl/projection/projection.hlsl
@@ -0,0 +1,81 @@
+#ifndef _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_
+#define _NBL_BUILTIN_HLSL_PROJECTION_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+// TODO: use glm instead for c++
+template<typename T>
+inline matrix<T, 4, 4> buildProjectionMatrixPerspectiveFovRH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
+{
+	// The field of view sets the Y scale; the X scale is that value divided by aspectRatio.
+	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+	const float yScale = core::reciprocal<float>(tanf(fieldOfViewRadians * 0.5f));
+	const float xScale = yScale / aspectRatio;
+	const float depthRange = zFar - zNear;
+
+	matrix<T, 4, 4> proj;
+	proj[0] = vector<T, 4>(xScale, 0.f, 0.f, 0.f);
+	proj[1] = vector<T, 4>(0.f, -yScale, 0.f, 0.f);
+	proj[2] = vector<T, 4>(0.f, 0.f, -zFar / depthRange, -zNear * zFar / depthRange);
+	proj[3] = vector<T, 4>(0.f, 0.f, -1.f, 0.f);
+	return proj;
+}
+template<typename T>
+inline matrix<T, 4, 4> buildProjectionMatrixPerspectiveFovLH(float fieldOfViewRadians, float aspectRatio, float zNear, float zFar)
+{
+	// The field of view sets the Y scale; the X scale is that value divided by aspectRatio.
+	_NBL_DEBUG_BREAK_IF(aspectRatio == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+	const float yScale = core::reciprocal<float>(tanf(fieldOfViewRadians * 0.5f));
+	const float xScale = yScale / aspectRatio;
+	const float depthRange = zFar - zNear;
+
+	matrix<T, 4, 4> proj;
+	proj[0] = vector<T, 4>(xScale, 0.f, 0.f, 0.f);
+	proj[1] = vector<T, 4>(0.f, -yScale, 0.f, 0.f);
+	proj[2] = vector<T, 4>(0.f, 0.f, zFar / depthRange, -zNear * zFar / depthRange);
+	proj[3] = vector<T, 4>(0.f, 0.f, 1.f, 0.f);
+	return proj;
+}
+
+template<typename T>
+inline matrix<T, 4, 4> buildProjectionMatrixOrthoRH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
+{
+	// X and Y are scaled by 2/width and -2/height; Z is remapped using the zNear..zFar distance.
+	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+	const float depthRange = zFar - zNear;
+	matrix<T, 4, 4> proj;
+	proj[0] = vector<T, 4>(2.f / widthOfViewVolume, 0.f, 0.f, 0.f);
+	proj[1] = vector<T, 4>(0.f, -2.f / heightOfViewVolume, 0.f, 0.f);
+	proj[2] = vector<T, 4>(0.f, 0.f, -1.f / depthRange, -zNear / depthRange);
+	proj[3] = vector<T, 4>(0.f, 0.f, 0.f, 1.f);
+	return proj;
+}
+
+template<typename T>
+inline matrix<T, 4, 4> buildProjectionMatrixOrthoLH(float widthOfViewVolume, float heightOfViewVolume, float zNear, float zFar)
+{
+	// X and Y are scaled by 2/width and -2/height; Z is remapped using the zNear..zFar distance.
+	_NBL_DEBUG_BREAK_IF(widthOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(heightOfViewVolume == 0.f); //division by zero
+	_NBL_DEBUG_BREAK_IF(zNear == zFar); //division by zero
+	const float depthRange = zFar - zNear;
+	matrix<T, 4, 4> proj;
+	proj[0] = vector<T, 4>(2.f / widthOfViewVolume, 0.f, 0.f, 0.f);
+	proj[1] = vector<T, 4>(0.f, -2.f / heightOfViewVolume, 0.f, 0.f);
+	proj[2] = vector<T, 4>(0.f, 0.f, 1.f / depthRange, -zNear / depthRange);
+	proj[3] = vector<T, 4>(0.f, 0.f, 0.f, 1.f);
+	return proj;
+}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/shapes/beziers.hlsl b/include/nbl/builtin/hlsl/shapes/beziers.hlsl
index d4ebe6d0d8..62116b638e 100644
--- a/include/nbl/builtin/hlsl/shapes/beziers.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/beziers.hlsl
@@ -412,7 +412,8 @@ struct Quadratic
         // p'(1/2) = 2(C-A)
             
         // exponent so large it would wipe the mantissa on any relative operation
-        const float_t PARAMETER_THRESHOLD = exp2(24);
+        // should be exp2<float_t>(numeric_limits<float_t>::digits) after tgmath has an exp2
+        const float_t PARAMETER_THRESHOLD = exp2(24.0f);
         Candidates candidates;
             
         float_t2 Bdiv2 = B*0.5;
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 2ecb08cdb2..3ca4c5c37c 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -7,9 +7,10 @@
 
 #ifdef __HLSL_VERSION // TODO: AnastZIuk fix public search paths so we don't choke
 #include "spirv/unified1/spirv.hpp"
-#endif
 
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
 #include "nbl/builtin/hlsl/type_traits.hlsl"
+#include <nbl/builtin/hlsl/concepts.hlsl>
 
 namespace nbl 
 {
@@ -233,17 +234,37 @@ template<typename Integral>
 [[vk::ext_instruction( spv::OpBitReverse )]]
 enable_if_t<is_integral_v<Integral>, Integral> bitReverse( Integral base );
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> && is_scalar_v<T>)
 [[vk::ext_instruction( spv::OpIsNan )]]
-enable_if_t<is_floating_point_v<FloatingPoint>, bool> isNan(FloatingPoint val);
+bool isNan(T val);
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> && is_scalar_v<T>)
 [[vk::ext_instruction( spv::OpIsInf )]]
-enable_if_t<is_floating_point_v<FloatingPoint>, bool> isInf(FloatingPoint val);
+bool isInf(T val);
+
+template<typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> && is_vector_v<T>)
+[[vk::ext_instruction(spv::OpIsNan)]]
+vector<bool, vector_traits<T>::Dimension> isNan(T val);
+
+template<typename T NBL_FUNC_REQUIRES(is_floating_point_v<T> && is_vector_v<T>)
+[[vk::ext_instruction(spv::OpIsInf)]]
+vector<bool, vector_traits<T>::Dimension> isInf(T val);
 
 template<typename Matrix>
 [[vk::ext_instruction( spv::OpTranspose )]]
-Matrix transpose(NBL_CONST_REF_ARG(Matrix) mat);
+Matrix transpose(Matrix mat);
+
+template<typename Integral>
+[[vk::ext_instruction(spv::OpBitCount)]]
+enable_if_t<is_integral_v<Integral>, Integral> bitCount(Integral value); // value: integer whose set bits are counted
+
+template<typename BooleanVector>
+[[vk::ext_instruction(spv::OpAll)]]
+enable_if_t<is_vector_v<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>, bool> all(BooleanVector vec); // OpAll reduces a Boolean vector to ONE scalar bool, so the result type is bool, not the vector type
+
+template<typename BooleanVector>
+[[vk::ext_instruction(spv::OpAny)]]
+enable_if_t<is_vector_v<BooleanVector> && is_same_v<typename vector_traits<BooleanVector>::scalar_type, bool>, bool> any(BooleanVector vec); // OpAny reduces a Boolean vector to ONE scalar bool, so the result type is bool, not the vector type
 
 }
 
@@ -252,3 +273,4 @@ Matrix transpose(NBL_CONST_REF_ARG(Matrix) mat);
 }
 
 #endif
+#endif
diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl
index 47cd335c2f..53c5c872c0 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl
@@ -2,64 +2,98 @@
 #define _NBL_BUILTIN_HLSL_SPIRV_INTRINSICS_GLSL_STD_450_INCLUDED_
 
 #ifdef __HLSL_VERSION
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
+#include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/basic.h>
 #include <nbl/builtin/hlsl/concepts.hlsl>
+#include <nbl/builtin/hlsl/tgmath/output_structs.hlsl>
 #include "spirv/unified1/GLSL.std.450.h"
 
-namespace nbl 
+namespace nbl
 {
 namespace hlsl
 {
 namespace spirv
 {
-// Find MSB and LSB restricted to 32-bit width component types https://registry.khronos.org/SPIR-V/specs/unified1/GLSL.std.450.html
-template<typename Integral32 NBL_FUNC_REQUIRES(is_same_v<Integral32, int32_t> || is_same_v<Integral32, uint32_t>)
-[[vk::ext_instruction(GLSLstd450::GLSLstd450FindILsb, "GLSL.std.450")]]
-Integral32 findILsb(Integral32 value);
 
-template<int N>
-[[vk::ext_instruction(GLSLstd450::GLSLstd450FindILsb, "GLSL.std.450")]]
-vector<int32_t, N> findILsb(vector<int32_t, N> value);
+namespace concepts
+{
+// Non-matrix type satisfying is_floating_point_v (intended: floating-point scalar or vector).
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointVectorOrScalar = is_floating_point_v<T> && (!is_matrix_v<T>);
+// Same as FloatingPointVectorOrScalar, further restricted to 16-bit or 32-bit components.
+template<typename T>
+NBL_BOOL_CONCEPT FloatingPointVectorOrScalar32or16BitSize = FloatingPointVectorOrScalar<T> && (sizeof(typename vector_traits<T>::scalar_type) == 4 || sizeof(typename vector_traits<T>::scalar_type) == 2);
+// Non-matrix integral type that is signed
+// (intended: signed integer scalar or signed integer vector).
+template<typename T>
+NBL_BOOL_CONCEPT IntegralVectorOrScalar = is_integral_v<T> && is_signed_v<T> && !is_matrix_v<T>;
+// Non-matrix integral type that is unsigned
+// (intended: unsigned integer scalar or unsigned integer vector).
+template<typename T>
+NBL_BOOL_CONCEPT UnsignedIntegralVectorOrScalar = is_integral_v<T> && is_unsigned_v<T> && !is_matrix_v<T>;
+// Signed integer scalar or vector whose components are exactly 32 bits wide;
+// used by instructions that are currently limited to 32-bit width components.
+template<typename T>
+NBL_BOOL_CONCEPT IntegralVectorOrScalar32BitSize = IntegralVectorOrScalar<T> && (sizeof(typename vector_traits<T>::scalar_type) == 4);
+// Unsigned integer scalar or vector whose components are exactly 32 bits wide;
+// used by instructions that are currently limited to 32-bit width components.
+template<typename T>
+NBL_BOOL_CONCEPT UnsignedIntegralVectorOrScalar32BitSize = UnsignedIntegralVectorOrScalar<T> && (sizeof(typename vector_traits<T>::scalar_type) == 4);
+}
 
-template<int N>
+// Find MSB and LSB restricted to 32-bit width component types https://registry.khronos.org/SPIR-V/specs/unified1/GLSL.std.450.html
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar32BitSize<T> || concepts::UnsignedIntegralVectorOrScalar32BitSize<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450FindILsb, "GLSL.std.450")]]
-vector<uint32_t, N> findILsb(vector<uint32_t, N> value);
-
-template<typename Int32_t NBL_FUNC_REQUIRES(is_same_v<Int32_t, int32_t>)
-[[vk::ext_instruction(GLSLstd450::GLSLstd450FindSMsb, "GLSL.std.450")]]
-int32_t findSMsb(Int32_t value);
+conditional_t<is_vector_v<T>, vector<int32_t, vector_traits<T>::Dimension>, int32_t> findILsb(T value);
 
-template<int N>
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar32BitSize<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450FindSMsb, "GLSL.std.450")]]
-vector<int32_t, N> findSMsb(vector<int32_t, N> value);
+conditional_t<is_vector_v<T>, vector<int32_t, vector_traits<T>::Dimension>, int32_t> findSMsb(T value);
 
-template<typename Uint32_t NBL_FUNC_REQUIRES(is_same_v<Uint32_t, uint32_t>)
+template<typename T NBL_FUNC_REQUIRES(concepts::UnsignedIntegralVectorOrScalar32BitSize<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450FindUMsb, "GLSL.std.450")]]
-int32_t findUMsb(Uint32_t value);
+conditional_t<is_vector_v<T>, vector<int32_t, vector_traits<T>::Dimension>, int32_t> findUMsb(T value);
 
-template<int N>
-[[vk::ext_instruction(GLSLstd450::GLSLstd450FindUMsb, "GLSL.std.450")]]
-vector<uint32_t, N> findUMsb(vector<uint32_t, N> value);
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450::GLSLstd450Pow, "GLSL.std.450")]]
+T pow(T lhs, T rhs);
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450::GLSLstd450Exp, "GLSL.std.450")]]
+T exp(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450Exp2, "GLSL.std.450")]]
-enable_if_t<is_floating_point<FloatingPoint>::value && !is_matrix_v<FloatingPoint>, FloatingPoint> exp2(FloatingPoint val);
+T exp2(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450::GLSLstd450Log, "GLSL.std.450")]]
+T log(T val);
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450::GLSLstd450Log2, "GLSL.std.450")]]
+T log2(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450::GLSLstd450Sqrt, "GLSL.std.450")]]
+T sqrt(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450InverseSqrt, "GLSL.std.450")]]
-enable_if_t<is_floating_point_v<FloatingPoint> && !is_matrix_v<FloatingPoint>, FloatingPoint> inverseSqrt(FloatingPoint val);
+T inverseSqrt(T val);
  
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450Floor, "GLSL.std.450")]]
-enable_if_t<is_floating_point_v<FloatingPoint> && !is_matrix_v<FloatingPoint>, FloatingPoint> floor(FloatingPoint val);
+T floor(T val);
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T> && is_scalar_v<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450Cross, "GLSL.std.450")]]
-enable_if_t<is_floating_point_v<FloatingPoint>, vector<FloatingPoint, 3> > cross(NBL_CONST_REF_ARG(vector<FloatingPoint, 3>) lhs, NBL_CONST_REF_ARG(vector<FloatingPoint, 3>) rhs);
+vector<T, 3> cross(NBL_CONST_REF_ARG(vector<T, 3>) lhs, NBL_CONST_REF_ARG(vector<T, 3>) rhs);
 
-template<typename FloatingPoint>
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
 [[vk::ext_instruction(GLSLstd450::GLSLstd450FMix, "GLSL.std.450")]]
-enable_if_t<is_floating_point_v<FloatingPoint> && !is_matrix_v<FloatingPoint>, FloatingPoint> fMix(FloatingPoint x, FloatingPoint y, FloatingPoint a);
+T fMix(T x, T y, T a);
 
 template<typename T, int N>
 [[vk::ext_instruction(GLSLstd450::GLSLstd450Determinant, "GLSL.std.450")]]
@@ -78,6 +112,174 @@ float32_t4 unpackSnorm4x8(uint32_t p);
 [[vk::ext_instruction(GLSLstd450UnpackUnorm4x8, "GLSL.std.450")]]
 float32_t4 unpackUnorm4x8(uint32_t p);
 
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Length, "GLSL.std.450")]]
+typename vector_traits<T>::scalar_type length(T vec);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Normalize, "GLSL.std.450")]]
+T normalize(T vec);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FClamp, "GLSL.std.450")]]
+T fClamp(T val, T _min, T _max);
+template<typename T NBL_FUNC_REQUIRES(concepts::UnsignedIntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450UClamp, "GLSL.std.450")]]
+T uClamp(T val, T _min, T _max);
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SClamp, "GLSL.std.450")]]
+T sClamp(T val, T _min, T _max);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FMin, "GLSL.std.450")]]
+T fMin(T a, T b); // FMin takes two operands and yields the smaller one (component-wise for vectors)
+template<typename T NBL_FUNC_REQUIRES(concepts::UnsignedIntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450UMin, "GLSL.std.450")]]
+T uMin(T a, T b); // UMin takes two operands (compared as unsigned) and yields the smaller one
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SMin, "GLSL.std.450")]]
+T sMin(T a, T b); // SMin takes two operands (compared as signed) and yields the smaller one
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FMax, "GLSL.std.450")]]
+T fMax(T a, T b); // FMax takes two operands and yields the larger one (component-wise for vectors)
+template<typename T NBL_FUNC_REQUIRES(concepts::UnsignedIntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450UMax, "GLSL.std.450")]]
+T uMax(T a, T b); // UMax takes two operands (compared as unsigned) and yields the larger one
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SMax, "GLSL.std.450")]]
+T sMax(T a, T b); // SMax takes two operands (compared as signed) and yields the larger one
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FAbs, "GLSL.std.450")]]
+T fAbs(T val);
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SAbs, "GLSL.std.450")]]
+T sAbs(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450Sin, "GLSL.std.450")]]
+T sin(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450Cos, "GLSL.std.450")]]
+T cos(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450Acos, "GLSL.std.450")]]
+T acos(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Fract, "GLSL.std.450")]]
+T fract(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Round, "GLSL.std.450")]]
+T round(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450RoundEven, "GLSL.std.450")]]
+T roundEven(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Trunc, "GLSL.std.450")]]
+T trunc(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Ceil, "GLSL.std.450")]]
+T ceil(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Fma, "GLSL.std.450")]]
+T fma(T x, T y, T z);
+
+template<typename T, typename U NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T> && 
+	(concepts::IntegralVectorOrScalar<U> || concepts::UnsignedIntegralVectorOrScalar<U>) && 
+	(vector_traits<T>::Dimension == vector_traits<U>::Dimension))
+[[vk::ext_instruction(GLSLstd450Ldexp, "GLSL.std.450")]]
+T ldexp(T arg, U exp);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FSign, "GLSL.std.450")]]
+T fSign(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::IntegralVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SSign, "GLSL.std.450")]]
+T sSign(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450Radians, "GLSL.std.450")]]
+T radians(T degrees);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar32or16BitSize<T>)
+[[vk::ext_instruction(GLSLstd450Degrees, "GLSL.std.450")]]
+T degrees(T radians);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Step, "GLSL.std.450")]]
+T step(T edge, T x);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450SmoothStep, "GLSL.std.450")]]
+T smoothStep(T edge0, T edge1, T x);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FaceForward, "GLSL.std.450")]]
+T faceForward(T N, T I, T Nref);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Reflect, "GLSL.std.450")]]
+T reflect(T I, T N);
+
+template<typename T, typename U NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450Refract, "GLSL.std.450")]]
+T refract(T I, T N, U eta); // I: incident vector, N: surface normal, eta: ratio of indices of refraction (a scalar, not a reference normal)
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450ModfStruct, "GLSL.std.450")]]
+ModfOutput<T> modfStruct(T val);
+
+template<typename T NBL_FUNC_REQUIRES(concepts::FloatingPointVectorOrScalar<T>)
+[[vk::ext_instruction(GLSLstd450FrexpStruct, "GLSL.std.450")]]
+FrexpOutput<T> frexpStruct(T val);
+
+[[vk::ext_instruction(GLSLstd450PackSnorm4x8, "GLSL.std.450")]]
+int32_t packSnorm4x8(float32_t4 vec);
+
+[[vk::ext_instruction(GLSLstd450PackUnorm4x8, "GLSL.std.450")]]
+int32_t packUnorm4x8(float32_t4 vec);
+
+[[vk::ext_instruction(GLSLstd450PackSnorm2x16, "GLSL.std.450")]]
+int32_t packSnorm2x16(float32_t2 vec);
+
+[[vk::ext_instruction(GLSLstd450PackUnorm2x16, "GLSL.std.450")]]
+int32_t packUnorm2x16(float32_t2 vec);
+
+[[vk::ext_instruction(GLSLstd450PackHalf2x16, "GLSL.std.450")]]
+int32_t packHalf2x16(float32_t2 vec);
+
+[[vk::ext_instruction(GLSLstd450PackDouble2x32, "GLSL.std.450")]]
+float64_t packDouble2x32(int32_t2 vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackSnorm2x16, "GLSL.std.450")]]
+float32_t2 unpackSnorm2x16(int32_t vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackUnorm2x16, "GLSL.std.450")]]
+float32_t2 unpackUnorm2x16(int32_t vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackHalf2x16, "GLSL.std.450")]]
+float32_t2 unpackHalf2x16(int32_t vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackSnorm4x8, "GLSL.std.450")]]
+float32_t4 unpackSnorm4x8(int32_t vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackUnorm4x8, "GLSL.std.450")]]
+float32_t4 unpackUnorm4x8(int32_t vec);
+
+[[vk::ext_instruction(GLSLstd450UnpackDouble2x32, "GLSL.std.450")]]
+int32_t2 unpackDouble2x32(float64_t vec);
+
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/tgmath.hlsl b/include/nbl/builtin/hlsl/tgmath.hlsl
index fade1faf2a..ce1ab4d662 100644
--- a/include/nbl/builtin/hlsl/tgmath.hlsl
+++ b/include/nbl/builtin/hlsl/tgmath.hlsl
@@ -4,70 +4,177 @@
 #ifndef _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_
 #define _NBL_BUILTIN_HLSL_TGMATH_INCLUDED_
 
+#include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
+#include <nbl/builtin/hlsl/tgmath/impl.hlsl>
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/ieee754.hlsl>
-#include <nbl/builtin/hlsl/type_traits.hlsl>
-
 #include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/vector.hlsl>
+// C++ headers
+#ifndef __HLSL_VERSION
+#include <algorithm>
+#include <cmath>
+#endif
 
 namespace nbl
 {
 namespace hlsl
 {
-namespace tgmath
-{
-
-//template <typename T>
-//inline bool isNaN(T val)
-//{
-//	using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
-//	using AsFloat = typename float_of_size<sizeof(T)>::type;
-//
-//	AsUint asUint = bit_cast<AsUint, T>(val);
-//	return bool((ieee754::extractBiasedExponent<T>(val) == ieee754::traits<AsFloat>::specialValueExp) && (asUint & ieee754::traits<AsFloat>::mantissaMask));
-//}
-//
-//template<typename T>
-//inline bool isInf(T val)
-//{
-//	using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
-//	using AsFloat = typename float_of_size<sizeof(T)>::type;
-//
-//	AsUint tmp = bit_cast<AsUint>(val);
-//	return (tmp & (~ieee754::traits<AsFloat>::signMask)) == ieee754::traits<AsFloat>::inf;
-//}
-//
-//#ifdef __HLSL_VERSION
-//#define DEFINE_IS_NAN_SPECIALIZATION(TYPE)\
-//template<>\
-//inline bool isNaN<TYPE>(TYPE val)\
-//{\
-//	return spirv::isNan(val);\
-//}\
-//
-//#define DEFINE_IS_INF_SPECIALIZATION(TYPE)\
-//template<>\
-//inline bool isInf<TYPE>(TYPE val)\
-//{\
-//	return spirv::isInf(val);\
-//}\
-//
-//DEFINE_IS_NAN_SPECIALIZATION(float16_t)
-//DEFINE_IS_NAN_SPECIALIZATION(float32_t)
-//DEFINE_IS_NAN_SPECIALIZATION(float64_t)
-//
-//DEFINE_IS_INF_SPECIALIZATION(float16_t)
-//DEFINE_IS_INF_SPECIALIZATION(float32_t)
-//DEFINE_IS_INF_SPECIALIZATION(float64_t)
-//
-//#undef DEFINE_IS_INF_SPECIALIZATION
-//#undef DEFINE_IS_NAN_SPECIALIZATION
-//#undef INTRINSIC_FUNC_NAMESPACE
-//#endif
-
-}
-
-}
-}
-
-#endif
\ No newline at end of file
+template<typename T>
+inline T erf(T x)
+{
+    return tgmath_impl::erf_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T erfInv(T x)
+{
+    return tgmath_impl::erfInv_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T floor(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::floor_helper<T>::__call(val);
+}
+
+template<typename T>
+inline typename tgmath_impl::isnan_helper<T>::return_t isnan(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::isnan_helper<T>::__call(val);
+}
+
+template<typename T>
+inline typename tgmath_impl::isinf_helper<T>::return_t isinf(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::isinf_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T pow(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y)
+{
+    return tgmath_impl::pow_helper<T>::__call(x, y);
+}
+
+template<typename T>
+inline T exp(NBL_CONST_REF_ARG(T) x)
+{
+    return tgmath_impl::exp_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T exp2(NBL_CONST_REF_ARG(T) x)
+{
+    return tgmath_impl::exp2_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T log(NBL_CONST_REF_ARG(T) x)
+{
+    return tgmath_impl::log_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T log2(NBL_CONST_REF_ARG(T) x)
+{
+    return tgmath_impl::log2_helper<T>::__call(x);
+}
+
+template<typename T>
+inline T abs(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::abs_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T sqrt(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::sqrt_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T sin(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::sin_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T cos(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::cos_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T acos(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::acos_helper<T>::__call(val);
+}
+
+/**
+* @brief Returns the fractional part of the given floating-point value.
+*
+* @tparam T Type of the value to operate on.
+*
+* @param [in] val The value to retrieve the fractional part from.
+*/
+template<typename T>
+inline T modf(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::modf_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T round(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::round_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T roundEven(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::roundEven_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T trunc(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::trunc_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T ceil(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::ceil_helper<T>::__call(val);
+}
+
+template<typename T>
+inline T fma(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(T) z)
+{
+    return tgmath_impl::fma_helper<T>::__call(x, y, z);
+}
+
+template<typename T, typename U>
+inline T ldexp(NBL_CONST_REF_ARG(T) arg, NBL_CONST_REF_ARG(U) exp)
+{
+    return tgmath_impl::ldexp_helper<T, U>::__call(arg, exp);
+}
+
+template<typename T>
+inline ModfOutput<T> modfStruct(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::modfStruct_helper<T>::__call(val);
+}
+
+template<typename T>
+inline FrexpOutput<T> frexpStruct(NBL_CONST_REF_ARG(T) val)
+{
+    return tgmath_impl::frexpStruct_helper<T>::__call(val);
+}
+
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
new file mode 100644
index 0000000000..c49eb44f82
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -0,0 +1,585 @@
+#ifndef _NBL_BUILTIN_HLSL_TGMATH_IMPL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TGMATH_IMPL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
+#include <nbl/builtin/hlsl/concepts.hlsl>
+#include <nbl/builtin/hlsl/concepts/vector.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/core.hlsl>
+#include <nbl/builtin/hlsl/spirv_intrinsics/glsl.std.450.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
+#include <nbl/builtin/hlsl/tgmath/output_structs.hlsl>
+
+// C++ includes
+#ifndef __HLSL_VERSION
+#include <cmath>
+#include <tgmath.h>
+#endif
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace tgmath_impl
+{
+
+template<typename UnsignedInteger NBL_FUNC_REQUIRES(hlsl::is_integral_v<UnsignedInteger> && hlsl::is_unsigned_v<UnsignedInteger>)
+inline bool isnan_uint_impl(UnsignedInteger val) // NaN test performed on an unsigned integer; presumably `val` holds the raw bits of a same-sized float - confirm at call sites
+{
+	using AsFloat = typename float_of_size<sizeof(UnsignedInteger)>::type; // floating-point type with the same byte size as UnsignedInteger
+	UnsignedInteger absVal = val & (hlsl::numeric_limits<UnsignedInteger>::max >> 1); // mask away the most significant (sign) bit
+	return absVal > (ieee754::traits<AsFloat>::specialValueExp << ieee754::traits<AsFloat>::mantissaBitCnt); // NOTE(review): the shifted value is presumably the +inf bit pattern, so anything above it is a NaN - confirm in ieee754.hlsl
+}
+template<typename UnsignedInteger NBL_FUNC_REQUIRES(hlsl::is_integral_v<UnsignedInteger>&& hlsl::is_unsigned_v<UnsignedInteger>)
+inline bool isinf_uint_impl(UnsignedInteger val) // infinity test performed on an unsigned integer; presumably `val` holds the raw bits of a same-sized float - confirm at call sites
+{
+	using AsFloat = typename float_of_size<sizeof(UnsignedInteger)>::type; // floating-point type with the same byte size as UnsignedInteger
+	return (val & (~ieee754::traits<AsFloat>::signMask)) == ieee754::traits<AsFloat>::inf; // ignore the sign-mask bits, then compare against the traits' `inf` constant (matches both signs)
+}
+
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct erf_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct erfInv_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct isnan_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct isinf_helper;
+template<typename V NBL_STRUCT_CONSTRAINABLE>
+struct floor_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct pow_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct exp_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct exp2_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct log_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct log2_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct abs_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct cos_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct sin_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct acos_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct sqrt_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct modf_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct round_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct roundEven_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct trunc_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct ceil_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct fma_helper;
+template<typename T, typename U NBL_STRUCT_CONSTRAINABLE>
+struct ldexp_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct modfStruct_helper;
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct frexpStruct_helper;
+
+#ifdef __HLSL_VERSION
+
+#define DECLVAL(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) experimental::declval<_T>()
+#define DECL_ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) const _T arg##i
+#define WRAP(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) _T
+#define ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) arg##i
+
+// the template<> needs to be written ourselves
+// return type is __VA_ARGS__ to protect against `,` in templated return types
+#define AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(HELPER_NAME, SPIRV_FUNCTION_NAME, ARG_TYPE_LIST, ARG_TYPE_SET, ...)\
+NBL_PARTIAL_REQ_TOP(is_same_v<decltype(spirv::SPIRV_FUNCTION_NAME<T>(BOOST_PP_SEQ_FOR_EACH_I(DECLVAL, _, ARG_TYPE_SET))), __VA_ARGS__ >) \
+struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST) NBL_PARTIAL_REQ_BOT(is_same_v<decltype(spirv::SPIRV_FUNCTION_NAME<T>(BOOST_PP_SEQ_FOR_EACH_I(DECLVAL, _, ARG_TYPE_SET))), __VA_ARGS__ >) >\
+{\
+	using return_t = __VA_ARGS__;\
+	static inline return_t __call( BOOST_PP_SEQ_FOR_EACH_I(DECL_ARG, _, ARG_TYPE_SET) )\
+	{\
+		return spirv::SPIRV_FUNCTION_NAME<T>( BOOST_PP_SEQ_FOR_EACH_I(ARG, _, ARG_TYPE_SET) );\
+	}\
+};
+
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sin_helper, sin, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cos_helper, cos, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acos_helper, acos, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, sAbs, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, fAbs, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sqrt_helper, sqrt, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(log_helper, log, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(log2_helper, log2, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(exp2_helper, exp2, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(exp_helper, exp, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(floor_helper, floor, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(round_helper, round, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(roundEven_helper, roundEven, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(trunc_helper, trunc, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(ceil_helper, ceil, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(pow_helper, pow, (T), (T)(T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(fma_helper, fma, (T), (T)(T)(T), T)
+template<typename T, typename U> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(ldexp_helper, ldexp, (T)(U), (T)(U), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(modfStruct_helper, modfStruct, (T), (T), ModfOutput<T>)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(frexpStruct_helper, frexpStruct, (T), (T), FrexpOutput<T>)
+
+#define ISINF_AND_ISNAN_RETURN_TYPE conditional_t<is_vector_v<T>, vector<bool, vector_traits<T>::Dimension>, bool>
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(isinf_helper, isInf, (T), (T), ISINF_AND_ISNAN_RETURN_TYPE)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(isnan_helper, isNan, (T), (T), ISINF_AND_ISNAN_RETURN_TYPE)
+#undef ISINF_AND_ISNAN_RETURN_TYPE 
+
+#undef DECLVAL
+#undef DECL_ARG
+#undef WRAP
+#undef ARG
+#undef AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER
+
+template<typename T> NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<T>)
+struct modf_helper<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<T>) >
+{
+	using return_t = T;
+	static inline return_t __call(const T x)
+	{
+		T tmp = abs_helper<T>::__call(x);
+		tmp = spirv::fract<T>(tmp);
+		if (x < 0)
+			tmp *= -1;
+
+		return tmp;
+	}
+};
+
+template<typename T> NBL_PARTIAL_REQ_TOP(concepts::FloatingPoint<T> && is_vector_v<T>)
+struct modf_helper<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPoint<T> && is_vector_v<T>) >
+{
+	using return_t = T;
+	static inline return_t __call(const T x)
+	{
+		using traits = hlsl::vector_traits<T>;
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<T, typename traits::scalar_type> setter;
+
+		return_t output;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			setter(output, i, modf_helper<typename traits::scalar_type>::__call(getter(x, i)));
+
+		return output;
+	}
+};
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
+struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) >
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x) // polynomial approximation of erf(_x) for floating-point scalars
+	{
+		const FloatingPoint a1 = 0.254829592; // a1..a5 and p match the coefficients of Abramowitz & Stegun formula 7.1.26
+		const FloatingPoint a2 = -0.284496736;
+		const FloatingPoint a3 = 1.421413741;
+		const FloatingPoint a4 = -1.453152027;
+		const FloatingPoint a5 = 1.061405429;
+		const FloatingPoint p = 0.3275911;
+
+		FloatingPoint signOfX = sign(_x); // must not be named `sign`: the local would hide the `sign` function inside its own initializer
+		FloatingPoint x = abs(_x); // evaluate on |x|, the sign is re-applied at the end
+
+		FloatingPoint t = 1.0 / (1.0 + p * x);
+		FloatingPoint y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x); // Horner evaluation of the degree-5 polynomial in t
+
+		return signOfX * y; // erf is odd: erf(-x) == -erf(x)
+	}
+};
+
+#else // C++ only specializations
+
+#define DECL_ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) const _T arg##i // per-element callback: `const _T arg<i>`, comma-prefixed when i != 0
+#define WRAP(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) _T // per-element callback: the bare type `_T`
+#define ARG(r,data,i,_T) BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(i,0)) arg##i // per-element callback: the argument name `arg<i>`
+
+// Stamps out a HELPER_NAME specialization whose __call forwards to std::STD_FUNCTION_NAME; no explicit template parameter is given to the std function because not every function used here is templated
+#define AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(HELPER_NAME, STD_FUNCTION_NAME, REQUIREMENT, ARG_TYPE_LIST, ARG_TYPE_SET, ...)\
+requires REQUIREMENT \
+struct HELPER_NAME<BOOST_PP_SEQ_FOR_EACH_I(WRAP, _, ARG_TYPE_LIST)>\
+{\
+	using return_t = __VA_ARGS__;\
+	static inline return_t __call( BOOST_PP_SEQ_FOR_EACH_I(DECL_ARG, _, ARG_TYPE_SET) )\
+	{\
+		return std::STD_FUNCTION_NAME( BOOST_PP_SEQ_FOR_EACH_I(ARG, _, ARG_TYPE_SET) );\
+	}\
+};
+
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(cos_helper, cos, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sin_helper, sin, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(acos_helper, acos, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(sqrt_helper, sqrt, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(abs_helper, abs, concepts::Scalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(log_helper, log, concepts::Scalar<T>, (T), (T), T) // NOTE(review): log/exp/exp2 take any Scalar but return T; if Scalar admits integers the std result is converted back to T - confirm intended
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(log2_helper, log2, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(exp2_helper, exp2, concepts::Scalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(exp_helper, exp, concepts::Scalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(floor_helper, floor, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(round_helper, round, concepts::FloatingPointScalar<T>, (T), (T), T)
+// TODO: uncomment when C++23
+//template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(roundEven_helper, roundeven, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(trunc_helper, trunc, concepts::FloatingPointScalar<T>, (T), (T), T)
+template<typename T> AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER(ceil_helper, ceil, concepts::FloatingPointScalar<T>, (T), (T), T)
+
+#undef DECL_ARG
+#undef WRAP
+#undef ARG
+#undef AUTO_SPECIALIZE_TRIVIAL_CASE_HELPER
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct pow_helper<T>
+{
+	using return_t = T;
+	static inline return_t __call(const T x, const T y) // returns `x` raised to the power `y`
+	{
+		return std::pow(x, y); // no explicit `<T>`: std::pow is an overload set that is not guaranteed to be a template
+	}
+};
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct modf_helper<T>
+{
+	using return_t = T;
+	static inline return_t __call(const T x) // yields only the fractional part of `x`
+	{
+		T discardedWholePart; // std::modf insists on writing the integral part somewhere; it is dropped
+		return std::modf(x, &discardedWholePart);
+	}
+};
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct isinf_helper<T>
+{
+	using return_t = bool;
+	static inline return_t __call(const T arg)
+	{
+		// GCC and Clang will always return false from a call to std::isinf when fast math is enabled,
+		// this implementation returns the appropriate output regardless of whether fast math is enabled or not
+		using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type; // unsigned integer type selected by the byte size of T
+		return tgmath_impl::isinf_uint_impl(reinterpret_cast<const AsUint&>(arg)); // the check runs on the reinterpreted bits of `arg`
+	}
+};
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct isnan_helper<T>
+{
+	using return_t = bool;
+	static inline return_t __call(const T arg)
+	{
+		// GCC and Clang will always return false from a call to std::isnan when fast math is enabled,
+		// this implementation returns the appropriate output regardless of whether fast math is enabled or not
+		using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type; // unsigned integer type selected by the byte size of T
+		return tgmath_impl::isnan_uint_impl(reinterpret_cast<const AsUint&>(arg)); // the check runs on the reinterpreted bits of `arg`
+	}
+};
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
+struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) > // C++ path: error function of a floating-point scalar
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) x)
+	{
+		return std::erf(x); // forwards straight to the standard library
+	}
+};
+
+// TODO: remove when C++23
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
+struct roundEven_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) >
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) x) // rounds to nearest, exact ties go to the even neighbour
+	{
+		// TODO: no way this is optimal, find a better implementation
+		FloatingPoint whole; // must have the type of `x`: there is no `std::modf(double, float*)` overload
+		if (std::abs(std::modf(x, &whole)) == FloatingPoint(0.5))
+		{
+			// exact tie: `whole` is `x` truncated towards zero; staying in floating point avoids overflowing an int32_t for large values
+			if (std::fmod(whole, FloatingPoint(2)) != FloatingPoint(0))
+				whole += std::copysign(FloatingPoint(1), x); // odd, so step away from zero onto the even neighbour
+			return whole;
+		}
+
+		return std::round(x);
+	}
+};
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
+struct fma_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) > // C++ path: fused multiply-add of floating-point scalars
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) x, NBL_CONST_REF_ARG(FloatingPoint) y, NBL_CONST_REF_ARG(FloatingPoint) z)
+	{
+		return std::fma(x, y, z); // x * y + z, forwarded to the standard library
+	}
+};
+
+template<typename T, typename U>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<T> && concepts::IntegralScalar<U>)
+struct ldexp_helper<T, U NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<T> && concepts::IntegralScalar<U>) > // C++ path: floating-point `arg` scaled by an integral power of two
+{
+	static T __call(NBL_CONST_REF_ARG(T) arg, NBL_CONST_REF_ARG(U) exp)
+	{
+		return std::ldexp(arg, exp); // arg * 2^exp, forwarded to the standard library
+	}
+};
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct modfStruct_helper<T>
+{
+	using return_t = ModfOutput<T>;
+	static inline return_t __call(const T val)
+	{
+		// std::modf hands back the fraction and stores the integral part through its pointer argument
+		return_t split;
+		split.fractionalPart = std::modf(val, &split.wholeNumberPart);
+		return split;
+	}
+};
+
+template<typename T>
+requires concepts::FloatingPointScalar<T>
+struct frexpStruct_helper<T>
+{
+	using return_t = FrexpOutput<T>;
+	static inline return_t __call(const T val)
+	{
+		// std::frexp hands back the significand and stores the exponent through its pointer argument
+		return_t decomposed;
+		decomposed.significand = std::frexp(val, &decomposed.exponent);
+		return decomposed;
+	}
+};
+
+#endif // C++ only specializations
+
+// C++ and HLSL specializations
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<FloatingPoint>)
+struct erfInv_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<FloatingPoint>) > // shared C++/HLSL approximation of the inverse error function
+{
+	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x)
+	{
+		FloatingPoint x = clamp<FloatingPoint>(_x, -0.99999, 0.99999); // keeps (1 - x) * (1 + x) strictly positive for the log below
+
+		FloatingPoint w = -log_helper<FloatingPoint>::__call((1.0 - x) * (1.0 + x));
+		FloatingPoint p; // polynomial in `w`, accumulated with Horner's scheme; the result is p * x
+		if (w < 5.0) // polynomial used while w < 5
+		{
+			w -= 2.5;
+			p = 2.81022636e-08; // NOTE(review): coefficients look like M. Giles' single-precision erfinv fit - confirm the source
+			p = 3.43273939e-07 + p * w;
+			p = -3.5233877e-06 + p * w;
+			p = -4.39150654e-06 + p * w;
+			p = 0.00021858087 + p * w;
+			p = -0.00125372503 + p * w;
+			p = -0.00417768164 + p * w;
+			p = 0.246640727 + p * w;
+			p = 1.50140941 + p * w;
+		}
+		else // w >= 5: a second polynomial, in sqrt(w) - 3
+		{
+			w = sqrt_helper<FloatingPoint>::__call(w) - 3.0;
+			p = -0.000200214257;
+			p = 0.000100950558 + p * w;
+			p = 0.00134934322 + p * w;
+			p = -0.00367342844 + p * w;
+			p = 0.00573950773 + p * w;
+			p = -0.0076224613 + p * w;
+			p = 0.00943887047 + p * w;
+			p = 1.00167406 + p * w;
+			p = 2.83297682 + p * w;
+		}
+		return p * x;
+	}
+};
+
+#ifdef __HLSL_VERSION
+// SPIR-V already defines specializations for builtin vector types, so on HLSL only non-builtin vectorial types are matched
+#define VECTOR_SPECIALIZATION_CONCEPT concepts::Vectorial<T> && !is_vector_v<T>
+#else
+#define VECTOR_SPECIALIZATION_CONCEPT concepts::Vectorial<T>
+#endif
+// Stamps out a vector specialization of HELPER_NAME that applies the scalar HELPER_NAME to every component and stores the results in a RETURN_TYPE
+#define AUTO_SPECIALIZE_HELPER_FOR_VECTOR(HELPER_NAME, RETURN_TYPE)\
+template<typename T>\
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)\
+struct HELPER_NAME<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >\
+{\
+	using return_t = RETURN_TYPE;\
+	static return_t __call(NBL_CONST_REF_ARG(T) vec)\
+	{\
+		using traits = hlsl::vector_traits<T>;\
+		using return_t_traits = hlsl::vector_traits<return_t>;\
+		array_get<T, typename traits::scalar_type> getter;\
+		array_set<return_t, typename return_t_traits::scalar_type> setter;\
+\
+		return_t output;\
+		for (uint32_t i = 0; i < traits::Dimension; ++i)\
+			setter(output, i, HELPER_NAME<typename traits::scalar_type>::__call(getter(vec, i)));\
+\
+		return output;\
+	}\
+};
+
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sqrt_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(abs_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(log_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(log2_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(exp2_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(exp_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(floor_helper, T)
+#define INT_VECTOR_RETURN_TYPE vector<int32_t, vector_traits<T>::Dimension> // per-component result type used for the isinf/isnan vector helpers below
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(isinf_helper, INT_VECTOR_RETURN_TYPE)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(isnan_helper, INT_VECTOR_RETURN_TYPE)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(cos_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(sin_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(acos_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(modf_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(round_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(roundEven_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(trunc_helper, T)
+AUTO_SPECIALIZE_HELPER_FOR_VECTOR(ceil_helper, T)
+
+#undef INT_VECTOR_RETURN_TYPE
+#undef AUTO_SPECIALIZE_HELPER_FOR_VECTOR
+
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct pow_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y)
+	{
+		using traits = hlsl::vector_traits<T>;
+		using scalar_t = typename traits::scalar_type;
+		array_get<T, scalar_t> readComponent;
+		array_set<T, scalar_t> writeComponent;
+		// component-wise: result[c] = pow(x[c], y[c]), delegated to the scalar specialization
+		return_t result;
+		for (uint32_t c = 0; c < traits::Dimension; ++c)
+			writeComponent(result, c, pow_helper<scalar_t>::__call(readComponent(x, c), readComponent(y, c)));
+		return result;
+	}
+};
+
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct fma_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) x, NBL_CONST_REF_ARG(T) y, NBL_CONST_REF_ARG(T) z)
+	{
+		using traits = hlsl::vector_traits<T>;
+		using scalar_t = typename traits::scalar_type;
+		array_get<T, scalar_t> readComponent;
+		array_set<T, scalar_t> writeComponent;
+		return_t result; // filled component by component with the scalar fma
+		for (uint32_t c = 0; c < traits::Dimension; ++c)
+			writeComponent(result, c, fma_helper<scalar_t>::__call(readComponent(x, c), readComponent(y, c), readComponent(z, c)));
+
+		return result;
+	}
+};
+
+template<typename T, typename U>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT && (vector_traits<T>::Dimension == vector_traits<U>::Dimension))
+struct ldexp_helper<T, U NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT && (vector_traits<T>::Dimension == vector_traits<U>::Dimension)) >
+{
+	using return_t = T;
+	static return_t __call(NBL_CONST_REF_ARG(T) arg, NBL_CONST_REF_ARG(U) exp)
+	{
+		using arg_traits = hlsl::vector_traits<T>;
+		using exp_traits = hlsl::vector_traits<U>;
+		using arg_scalar_t = typename arg_traits::scalar_type;
+		using exp_scalar_t = typename exp_traits::scalar_type;
+		array_get<T, arg_scalar_t> readArg;
+		array_get<U, exp_scalar_t> readExp;
+		array_set<T, arg_scalar_t> writeResult;
+		return_t result;
+		for (uint32_t c = 0; c < arg_traits::Dimension; ++c) // pairs arg[c] with exp[c]; both dimensions are equal by the requirement above
+			writeResult(result, c, ldexp_helper<arg_scalar_t, exp_scalar_t>::__call(readArg(arg, c), readExp(exp, c)));
+		return result;
+	}
+};
+
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct modfStruct_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = ModfOutput<T>;
+	static return_t __call(NBL_CONST_REF_ARG(T) x)
+	{
+		using traits = hlsl::vector_traits<T>;
+		using scalar_t = typename traits::scalar_type;
+		array_get<T, scalar_t> readComponent;
+		array_set<T, scalar_t> writeComponent;
+
+		// split every component with the scalar specialization and gather the halves into two vectors
+		T fractions;
+		T wholes;
+		for (uint32_t c = 0; c < traits::Dimension; ++c)
+		{
+			ModfOutput<scalar_t> parts = modfStruct_helper<scalar_t>::__call(readComponent(x, c));
+			writeComponent(fractions, c, parts.fractionalPart);
+			writeComponent(wholes, c, parts.wholeNumberPart);
+		}
+
+		return_t result;
+		result.fractionalPart = fractions;
+		result.wholeNumberPart = wholes;
+
+		return result;
+	}
+};
+
+template<typename T>
+NBL_PARTIAL_REQ_TOP(VECTOR_SPECIALIZATION_CONCEPT)
+struct frexpStruct_helper<T NBL_PARTIAL_REQ_BOT(VECTOR_SPECIALIZATION_CONCEPT) >
+{
+	using return_t = FrexpOutput<T>;
+	static return_t __call(NBL_CONST_REF_ARG(T) x) // component-wise frexp: significand vector plus integer exponent vector
+	{
+		using traits = hlsl::vector_traits<T>;
+		using exponent_vector_t = vector<int, traits::Dimension>; // the type of `exponent` in the vector FrexpOutput specialization
+		array_get<T, typename traits::scalar_type> getter;
+		array_set<T, typename traits::scalar_type> significandSetter;
+		array_set<exponent_vector_t, int> exponentSetter;
+		T significandOut;
+		exponent_vector_t exponentOut; // exponents are integers, so they are not staged in the floating-point vector `T`
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+		{
+			using component_return_t = FrexpOutput<typename vector_traits<T>::scalar_type>;
+			component_return_t result = frexpStruct_helper<typename traits::scalar_type>::__call(getter(x, i));
+
+			significandSetter(significandOut, i, result.significand);
+			exponentSetter(exponentOut, i, result.exponent);
+		}
+
+		return_t output;
+		output.significand = significandOut;
+		output.exponent = exponentOut;
+
+		return output;
+	}
+};
+
+#undef VECTOR_SPECIALIZATION_CONCEPT
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/tgmath/output_structs.hlsl b/include/nbl/builtin/hlsl/tgmath/output_structs.hlsl
new file mode 100644
index 0000000000..2489aff485
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tgmath/output_structs.hlsl
@@ -0,0 +1,46 @@
+// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_TGMATH_OUTPUT_STRUCTS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TGMATH_OUTPUT_STRUCTS_INCLUDED_ // named after this file's path (tgmath/output_structs.hlsl) so it cannot collide with another header's guard
+
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/vector.hlsl>
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct ModfOutput; // primary template is only declared; usable types are provided by the specialization below
+
+template<typename T>  NBL_PARTIAL_REQ_TOP(concepts::FloatingPoint<T>)
+struct ModfOutput<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPoint<T>) > // result pair of a modf-style split, both parts share the input type
+{
+	T fractionalPart;
+	T wholeNumberPart;
+};
+
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct FrexpOutput; // primary template is only declared; scalar and vector cases are specialized below
+
+template<typename T> NBL_PARTIAL_REQ_TOP(concepts::FloatingPointScalar<T>)
+struct FrexpOutput<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScalar<T>) > // scalar case: one significand and one int exponent
+{
+	T significand;
+	int exponent;
+};
+
+template<typename T> NBL_PARTIAL_REQ_TOP(concepts::FloatingPointVector<T>)
+struct FrexpOutput<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointVector<T>) > // vector case: one int exponent per component of T
+{
+	T significand;
+	vector<int, vector_traits<T>::Dimension> exponent;
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl
index 3a9ff1d4ef..d1db2f5b8f 100644
--- a/include/nbl/builtin/hlsl/type_traits.hlsl
+++ b/include/nbl/builtin/hlsl/type_traits.hlsl
@@ -12,7 +12,6 @@ template<typename E>
 concept is_scoped_enum = std::is_enum_v<E> && !std::is_convertible_v<E, std::underlying_type_t<E>>;
 #endif
 
-
 #include <nbl/builtin/hlsl/cpp_compat/basic.h>
 
 
@@ -565,7 +564,7 @@ template<class T>
 using rank = std::rank<T>;
 
 template<class T, unsigned I = 0> 
-using extent = std::extent<T, I>;
+struct extent : std::extent<T, I> {}; // a struct deriving from std::extent rather than an alias: this file partially specializes it for vector and matrix
 
 template<bool B, class T = void>
 using enable_if = std::enable_if<B, T>;
@@ -744,28 +743,33 @@ struct float_of_size
 {
     using type = void;
 };
-
 template<>
 struct float_of_size<2>
 {
     using type = float16_t;
 };
-
 template<>
 struct float_of_size<4>
 {
     using type = float32_t;
 };
-
 template<>
 struct float_of_size<8>
 {
     using type = float64_t;
 };
-
 template<uint16_t bytesize>
 using float_of_size_t = typename float_of_size<bytesize>::type;
 
+template<typename T, int N>
+struct extent<vector<T, N>, 0> : integral_constant<uint64_t, N> {}; // vector: dimension 0 has N elements
+
+template<typename T, int N, int M>
+struct extent<matrix<T, N, M>, 0> : integral_constant<uint64_t, N> {}; // matrix: dimension 0 is the first size parameter N
+
+template<typename T, int N, int M>
+struct extent<matrix<T, N, M>, 1> : integral_constant<uint64_t, M> {}; // matrix: dimension 1 is the second size parameter M
+
 }
 }
 
diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl
index 487d4a7d75..21f1eb1909 100644
--- a/include/nbl/builtin/hlsl/utility.hlsl
+++ b/include/nbl/builtin/hlsl/utility.hlsl
@@ -13,6 +13,8 @@ namespace nbl
 {
 namespace hlsl
 {
+template<typename T>
+const static bool always_true = true; // `true` for every T; presumably used to make a condition depend on a template parameter - TODO confirm intended use
 #ifndef __HLSL_VERSION
 
 template<class T>
diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl
index a4e54e6b3f..9aefc3b3d8 100644
--- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl
+++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl
@@ -10,16 +10,22 @@ namespace hlsl
 // The whole purpose of this file is to enable the creation of partial specializations of the vector_traits for 
 // custom types without introducing circular dependencies.
 
-template<typename VecT>
-struct vector_traits;
+template<typename T>
+struct vector_traits // primary template: a type without a specialization is described as a single component of itself
+{
+    using scalar_type = T;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t Dimension = 1u;
+    NBL_CONSTEXPR_STATIC_INLINE bool IsVector = false; // the `vector<T, DIMENSION>` specializations below set this to true
+};
 
 // i choose to implement it this way because of this DXC bug: https://github.com/microsoft/DirectXShaderCom0piler/issues/7007
-#define DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(DIMENSION) \
+#define DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(DIMENSION)\
 template<typename T> \
-struct vector_traits<vector<T, DIMENSION> > \
-{ \
-    using scalar_type = T; \
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t Dimension = DIMENSION; \
+struct vector_traits<vector<T, DIMENSION> >\
+{\
+    using scalar_type = T;\
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t Dimension = DIMENSION;\
+    NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\
 };\
 
 DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2)
diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl
new file mode 100644
index 0000000000..e1fa9dd3a0
--- /dev/null
+++ b/include/nbl/builtin/hlsl/vector_utils/vector_utils.hlsl
@@ -0,0 +1,21 @@
+#ifndef _NBL_BUILTIN_HLSL_VECTOR_UTILS_VECTOR_UTILS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_VECTOR_UTILS_VECTOR_UTILS_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+
+// TODO: why can't I use NBL_CONST_REF_ARG(vector<T, N>) for the parameter?
+template<typename T, uint32_t N>
+inline T lengthsquared(vector<T, N> vec)
+{
+	return dot(vec, vec); // squared length: the dot product of the vector with itself
+}
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/workgroup/fft.hlsl b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
index 51cfe45364..15ba0e7d32 100644
--- a/include/nbl/builtin/hlsl/workgroup/fft.hlsl
+++ b/include/nbl/builtin/hlsl/workgroup/fft.hlsl
@@ -1,7 +1,6 @@
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/concepts.hlsl>
 #include <nbl/builtin/hlsl/fft/common.hlsl>
-#include <nbl/builtin/hlsl/bitreverse.hlsl>
 
 #ifndef _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_
 #define _NBL_BUILTIN_HLSL_WORKGROUP_FFT_INCLUDED_
diff --git a/include/nbl/builtin/random/xoroshiro.hlsl b/include/nbl/builtin/random/xoroshiro.hlsl
deleted file mode 100644
index fa917980e4..0000000000
--- a/include/nbl/builtin/random/xoroshiro.hlsl
+++ /dev/null
@@ -1,76 +0,0 @@
-
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_BUILTIN_GLSL_RANDOM_XOROSHIRO_HLSL_INCLUDED_
-#define _NBL_BUILTIN_GLSL_RANDOM_XOROSHIRO_HLSL_INCLUDED_
-
-//#include <nbl/builtin/hlsl/math/functions.hlsl>
-
-// TODO[Przemek]: include functions.hlsl instead
-uint32_t rotl(NBL_CONST_REF_ARG(uint32_t) x, NBL_CONST_REF_ARG(uint32_t) k)
-{
-   return (x<<k) | (x>>(32u-k));
-}
-
-namespace nbl
-{
-namespace hlsl
-{
-
-typedef uint2 xoroshiro64star_state_t;
-typedef uint2 xoroshiro64starstar_state_t;
-
-namespace impl
-{
-	uint2 xoroshiro64_state_advance(uint2 state)
-	{
-		state[1] ^= state[0];
-		state[0] = rotl(state[0], 26u) ^ state[1] ^ (state[1]<<9u); // a, b
-		state[1] = rotl(state[1], 13u); // c
-		
-		return state;
-	}
-}
-
-struct Xoroshriro64Star
-{
-	static Xoroshriro64Star construct(xoroshiro64star_state_t initialState)
-	{
-		return { initialState };
-	}
-	
-	uint32_t operator()()
-	{
-		const uint32_t result = state[0]*0x9E3779BBu;
-		state = impl::xoroshiro64_state_advance(state);
-
-		return result;
-	}
-
-	xoroshiro64star_state_t state;
-};
-
-struct Xoroshriro64StarStar
-{
-	static Xoroshriro64StarStar construct(xoroshiro64starstar_state_t initialState)
-	{
-		return { initialState };
-	}
-	
-	uint32_t operator()()
-	{
-		const uint32_t result = rotl(state[0]*0x9E3779BBu,5u)*5u;
-	    state = impl::xoroshiro64_state_advance(state);
-	
-		return result;
-	}
-
-	xoroshiro64starstar_state_t state;
-};
-
-}
-}
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h
index 7c231fea4b..74fdf61693 100644
--- a/include/nbl/core/decl/smart_refctd_ptr.h
+++ b/include/nbl/core/decl/smart_refctd_ptr.h
@@ -144,6 +144,15 @@ smart_refctd_ptr<U> move_and_dynamic_cast(smart_refctd_ptr<T>& smart_ptr);
 template< class U, class T >
 smart_refctd_ptr<U> move_and_dynamic_cast(smart_refctd_ptr<T>&& smart_ptr) {return move_and_dynamic_cast<U,T>(smart_ptr);}
 
+template<typename>
+struct is_smart_refctd_ptr : std::false_type {}; // default: not a smart_refctd_ptr
+
+template<typename T>
+struct is_smart_refctd_ptr<smart_refctd_ptr<T>> : std::true_type {}; // matches exactly `smart_refctd_ptr<T>` (a cv-qualified or reference type falls back to the default)
+
+template<typename T>
+inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr<T>::value; // convenience variable template for the trait above
+
 } // end namespace nbl::core
 
 /*
diff --git a/include/nbl/core/declarations.h b/include/nbl/core/declarations.h
index 9aa708a793..466ea988aa 100644
--- a/include/nbl/core/declarations.h
+++ b/include/nbl/core/declarations.h
@@ -50,7 +50,6 @@
 #include "nbl/core/math/colorutil.h"
 #include "nbl/core/math/rational.h"
 #include "nbl/core/math/plane3dSIMD.h"
-#include "nbl/core/math/matrixutil.h"
 // memory
 #include "nbl/core/memory/memory.h"
 #include "nbl/core/memory/new_delete.h"
diff --git a/include/nbl/core/definitions.h b/include/nbl/core/definitions.h
index c08af6ad74..5913c2c8f2 100644
--- a/include/nbl/core/definitions.h
+++ b/include/nbl/core/definitions.h
@@ -15,8 +15,4 @@
 #include "nbl/core/math/floatutil.tcc"
 #include "nbl/core/math/glslFunctions.tcc"
 
-// implementations [deprecated]
-#include "matrix3x4SIMD_impl.h"
-#include "matrix4SIMD_impl.h"
-
 #endif
\ No newline at end of file
diff --git a/include/nbl/core/math/floatutil.h b/include/nbl/core/math/floatutil.h
index d66e3d6275..b65aa84aa6 100644
--- a/include/nbl/core/math/floatutil.h
+++ b/include/nbl/core/math/floatutil.h
@@ -26,8 +26,6 @@ namespace nbl::core
 {
 
 class vectorSIMDf;
-class matrix3x4SIMD;
-class matrix4SIMD;
 
 //! Rounding error constant often used when comparing values. (TODO: remove)
 template<typename T>
@@ -39,9 +37,9 @@ NBL_FORCE_INLINE double ROUNDING_ERROR<double>();
 template<>
 NBL_FORCE_INLINE vectorSIMDf ROUNDING_ERROR<vectorSIMDf>();
 template<>
-NBL_FORCE_INLINE matrix3x4SIMD ROUNDING_ERROR<matrix3x4SIMD>();
+NBL_FORCE_INLINE hlsl::float32_t3x4 ROUNDING_ERROR<hlsl::float32_t3x4>(); // NOTE(review): declaration only - this patch deletes the matrix definitions from floatutil.tcc, confirm a definition exists
 template<>
-NBL_FORCE_INLINE matrix4SIMD ROUNDING_ERROR<matrix4SIMD>();
+NBL_FORCE_INLINE hlsl::float32_t4x4 ROUNDING_ERROR<hlsl::float32_t4x4>(); // NOTE(review): same as above, no definition is added by this patch
 
 #ifdef PI // make sure we don't collide with a define
 #undef PI
diff --git a/include/nbl/core/math/floatutil.tcc b/include/nbl/core/math/floatutil.tcc
index 71c8bd2da7..e2b43a25bb 100644
--- a/include/nbl/core/math/floatutil.tcc
+++ b/include/nbl/core/math/floatutil.tcc
@@ -5,9 +5,8 @@
 #ifndef __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__
 #define __NBL_CORE_FLOAT_UTIL_TCC_INCLUDED__
 
-
 #include "nbl/core/math/floatutil.h"
-#include "matrix4SIMD.h"
+#include "vectorSIMD.h"
 
 namespace nbl
 {
@@ -29,16 +28,6 @@ NBL_FORCE_INLINE vectorSIMDf ROUNDING_ERROR<vectorSIMDf>()
 {
 	return vectorSIMDf(ROUNDING_ERROR<float>());
 }
-template<>
-NBL_FORCE_INLINE matrix3x4SIMD ROUNDING_ERROR<matrix3x4SIMD>()
-{
-	return matrix3x4SIMD(ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>());
-}
-template<>
-NBL_FORCE_INLINE matrix4SIMD ROUNDING_ERROR<matrix4SIMD>()
-{
-	return matrix4SIMD(ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>(),ROUNDING_ERROR<vectorSIMDf>());
-}
 template<typename T>
 NBL_FORCE_INLINE T ROUNDING_ERROR()
 {
diff --git a/include/nbl/core/math/glslFunctions.h b/include/nbl/core/math/glslFunctions.h
index 2bd17cd642..30a0344501 100644
--- a/include/nbl/core/math/glslFunctions.h
+++ b/include/nbl/core/math/glslFunctions.h
@@ -10,6 +10,7 @@
 
 #include "nbl/type_traits.h"
 #include "nbl/core/math/floatutil.h"
+#include "nbl/builtin/hlsl/cpp_compat/matrix.hlsl"
 
 namespace nbl
 {
@@ -21,9 +22,6 @@ class vectorSIMDBool;
 template <class T>
 class vectorSIMD_32;
 class vectorSIMDf;
-class matrix4SIMD;
-class matrix3x4SIMD;
-
 
 template<typename T>
 NBL_FORCE_INLINE T radians(const T& degrees)
@@ -123,17 +121,17 @@ NBL_FORCE_INLINE T mix(const T & a, const T & b, const U & t)
 		}
 		else
 		{
-			if constexpr(nbl::is_any_of<T,matrix4SIMD,matrix3x4SIMD>::value)
+			if constexpr(nbl::is_any_of<T, hlsl::float32_t4x4, hlsl::float32_t3x4>::value)
 			{
 				for (uint32_t i=0u; i<T::VectorCount; i++)
 				{
-					if constexpr(nbl::is_any_of<U, matrix4SIMD, matrix3x4SIMD>::value)
+					if constexpr(nbl::is_any_of<U, hlsl::float32_t4x4, hlsl::float32_t3x4>::value)
 					{
-						retval[i] = core::mix<vectorSIMDf, vectorSIMDf>(a.rows[i], b.rows[i], t.rows[i]);
+						retval[i] = core::mix<vectorSIMDf, vectorSIMDf>(a[i], b[i], t[i]);
 					}
 					else
 					{
-						retval[i] = core::mix<vectorSIMDf, U>(a.rows[i], b.rows[i], t);
+						retval[i] = core::mix<vectorSIMDf, U>(a[i], b[i], t);
 					}
 					
 				}
@@ -317,13 +315,18 @@ NBL_FORCE_INLINE T lerp(const T& a, const T& b, const U& t)
 	return core::mix<T,U>(a,b,t);
 }
 
-
 // TODO : step,smoothstep,isnan,isinf,floatBitsToInt,floatBitsToUint,intBitsToFloat,uintBitsToFloat,frexp,ldexp
 // extra note, GCC breaks isfinite, isinf, isnan, isnormal, signbit in -ffast-math so need to implement ourselves
 // TODO : packUnorm2x16, packSnorm2x16, packUnorm4x8, packSnorm4x8, unpackUnorm2x16, unpackSnorm2x16, unpackUnorm4x8, unpackSnorm4x8, packHalf2x16, unpackHalf2x16, packDouble2x32, unpackDouble2x32
 // MOVE : faceforward, reflect, refract, any, all, not
 template<typename T>
-NBL_FORCE_INLINE T dot(const T& a, const T& b);
+NBL_FORCE_INLINE T dot(const T& a, const T& b) // generic fallback, used for any T that has no explicit specialization
+{
+	static_assert(!(std::is_same_v<T, hlsl::float32_t2> || std::is_same_v<T, hlsl::float32_t3> || std::is_same_v<T, hlsl::float32_t4>)); // the hlsl float vectors are rejected at compile time
+	// NOTE(review): every other unspecialized T silently gets zero instead of a dot product - confirm this is intended
+	return T(0);
+}
+
 template<>
 NBL_FORCE_INLINE vectorSIMDf dot<vectorSIMDf>(const vectorSIMDf& a, const vectorSIMDf& b);
 template<>
@@ -331,7 +334,6 @@ NBL_FORCE_INLINE vectorSIMD_32<int32_t> dot<vectorSIMD_32<int32_t>>(const vector
 template<>
 NBL_FORCE_INLINE vectorSIMD_32<uint32_t> dot<vectorSIMD_32<uint32_t>>(const vectorSIMD_32<uint32_t>& a, const vectorSIMD_32<uint32_t>& b);
 
-
 template<typename T>
 NBL_FORCE_INLINE T lengthsquared(const T& v)
 {
@@ -362,7 +364,7 @@ NBL_FORCE_INLINE vectorSIMDf cross<vectorSIMDf>(const vectorSIMDf& a, const vect
 template<typename T>
 NBL_FORCE_INLINE T normalize(const T& v)
 {
-	auto d = dot<T>(v, v);
+	auto d = core::dot<T>(v, v);
 #ifdef __NBL_FAST_MATH
 	return v * core::inversesqrt<T>(d);
 #else
@@ -373,11 +375,6 @@ NBL_FORCE_INLINE T normalize(const T& v)
 // TODO : matrixCompMult, outerProduct, inverse
 template<typename T>
 NBL_FORCE_INLINE T transpose(const T& m);
-template<>
-NBL_FORCE_INLINE matrix4SIMD transpose(const matrix4SIMD& m);
-
-
-
 
 // Extras
 
@@ -424,10 +421,6 @@ template<typename T>
 NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance);
 template<>
 NBL_FORCE_INLINE bool equals<vectorSIMDf>(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& tolerance);
-template<>
-NBL_FORCE_INLINE bool equals<matrix4SIMD>(const matrix4SIMD& a, const matrix4SIMD& b, const matrix4SIMD& tolerance);
-template<>
-NBL_FORCE_INLINE bool equals<matrix3x4SIMD>(const matrix3x4SIMD& a, const matrix3x4SIMD& b, const matrix3x4SIMD& tolerance);
 
 
 //! returns if a equals zero, taking rounding errors into account
diff --git a/include/nbl/core/math/glslFunctions.tcc b/include/nbl/core/math/glslFunctions.tcc
index 205585965b..5c86d4c6c1 100644
--- a/include/nbl/core/math/glslFunctions.tcc
+++ b/include/nbl/core/math/glslFunctions.tcc
@@ -8,7 +8,6 @@
 #include "nbl/core/declarations.h"
 
 #include "nbl/core/math/floatutil.tcc"
-#include "matrix4SIMD.h"
 
 #include <cmath>
 #include <numeric>
@@ -229,7 +228,6 @@ NBL_FORCE_INLINE T max(const T& a, const T& b)
 	return core::mix<T,bool_type>(a,vb,asmaller);
 }
 
-
 template<>
 NBL_FORCE_INLINE vectorSIMDf dot<vectorSIMDf>(const vectorSIMDf& a, const vectorSIMDf& b)
 {
@@ -280,21 +278,6 @@ NBL_FORCE_INLINE vectorSIMDf cross<vectorSIMDf>(const vectorSIMDf& a, const vect
 #endif
 }
 
-template<>
-NBL_FORCE_INLINE matrix4SIMD transpose(const matrix4SIMD& m)
-{
-	core::matrix4SIMD retval;
-	__m128 a0 = m.rows[0].getAsRegister(), a1 = m.rows[1].getAsRegister(), a2 = m.rows[2].getAsRegister(), a3 = m.rows[3].getAsRegister();
-	_MM_TRANSPOSE4_PS(a0, a1, a2, a3);
-	retval.rows[0] = a0;
-	retval.rows[1] = a1;
-	retval.rows[2] = a2;
-	retval.rows[3] = a3;
-	return retval;
-}
-
-
-
 template<>
 NBL_FORCE_INLINE bool equals<vectorSIMDf>(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& tolerance)
 {
@@ -307,29 +290,12 @@ NBL_FORCE_INLINE bool equals(const core::vector3df& a, const core::vector3df& b,
 	auto la = a-tolerance;
 	return ha.X>=b.X&&ha.Y>=b.Y&&ha.Z>=b.Z && la.X<=b.X&&la.Y<=b.Y&&la.Z<=b.Z;
 }
-template<>
-NBL_FORCE_INLINE bool equals<matrix4SIMD>(const matrix4SIMD& a, const matrix4SIMD& b, const matrix4SIMD& tolerance)
-{
-	for (size_t i = 0u; i<matrix4SIMD::VectorCount; ++i)
-		if (!equals<vectorSIMDf>(a.rows[i], b.rows[i], tolerance.rows[i]))
-			return false;
-	return true;
-}
-template<>
-NBL_FORCE_INLINE bool equals<matrix3x4SIMD>(const matrix3x4SIMD& a, const matrix3x4SIMD& b, const matrix3x4SIMD& tolerance)
-{
-	for (size_t i = 0u; i<matrix3x4SIMD::VectorCount; ++i)
-		if (!equals<vectorSIMDf>(a.rows[i], b.rows[i], tolerance[i]))
-			return false;
-	return true;
-}
 template<typename T>
 NBL_FORCE_INLINE bool equals(const T& a, const T& b, const T& tolerance)
 {
 	return (a + tolerance >= b) && (a - tolerance <= b);
 }
 
-
 template<>
 NBL_FORCE_INLINE vectorSIMDf sin<vectorSIMDf>(const vectorSIMDf& a)
 {
@@ -342,8 +308,6 @@ NBL_FORCE_INLINE T sin(const T& a)
 	return std::sin(a);
 }
 
-
-
 // extras
 
 
diff --git a/include/nbl/core/math/intutil.h b/include/nbl/core/math/intutil.h
index 7a94844258..cb7e17728a 100644
--- a/include/nbl/core/math/intutil.h
+++ b/include/nbl/core/math/intutil.h
@@ -8,6 +8,17 @@
 
 #include "nbl/builtin/hlsl/math/intutil.hlsl"
 
+#include "nbl/builtin/hlsl/cpp_compat/intrinsics.h"
+#include "nbl/macros.h"
+#include "nbl/core/math/glslFunctions.h"
+
+#include <cstdint>
+#include <limits.h> // For INT_MAX / UINT_MAX
+#include <initializer_list>
+#include <type_traits>
+#ifdef _MSC_VER
+    #include <intrin.h>
+#endif
 
 namespace nbl
 {
diff --git a/include/nbl/core/math/matrixutil.h b/include/nbl/core/math/matrixutil.h
deleted file mode 100644
index afe7955c9b..0000000000
--- a/include/nbl/core/math/matrixutil.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-
-#ifndef _NBL_MATRIX_UTIL_H_INCLUDED_
-#define _NBL_MATRIX_UTIL_H_INCLUDED_
-
-#include "matrix4SIMD.h"
-#include "matrix3x4SIMD.h"
-
-namespace nbl::core
-{
-
-
-//! TODO: OPTIMIZE THIS, DON'T PROMOTE THE MATRIX IF DON'T HAVE TO
-inline matrix4SIMD concatenateBFollowedByA(const matrix4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return concatenateBFollowedByA(_a, matrix4SIMD(_b));
-}
-/*
-inline matrix4SIMD concatenateBFollowedByAPrecisely(const matrix4SIMD& _a, const matrix3x4SIMD& _b)
-{
-    return concatenateBFollowedByAPrecisely(_a, matrix4SIMD(_b));
-}
-*/
-
-}
-
-#endif
diff --git a/include/nbl/core/math/plane3dSIMD.h b/include/nbl/core/math/plane3dSIMD.h
index 891ed1300c..0a9f163208 100644
--- a/include/nbl/core/math/plane3dSIMD.h
+++ b/include/nbl/core/math/plane3dSIMD.h
@@ -6,7 +6,9 @@
 #ifndef __NBL_CORE_PLANE_3D_H_INCLUDED__
 #define __NBL_CORE_PLANE_3D_H_INCLUDED__
 
-#include "matrix3x4SIMD.h"
+#include <vectorSIMD.h>
+#include <nbl/core/math/glslFunctions.h>
+#include <nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl>
 
 namespace nbl
 {
@@ -99,14 +101,20 @@ class plane3dSIMDf : private vectorSIMDf
 		}
 
 		//!
-        static inline plane3dSIMDf transform(const plane3dSIMDf& _in, const matrix3x4SIMD& _mat)
+        static inline plane3dSIMDf transform(const plane3dSIMDf& in, const hlsl::float32_t3x4& mat)
         {
-            matrix3x4SIMD inv;
-            _mat.getInverse(inv);
+			hlsl::float32_t3x4 a = mat;
+			hlsl::float32_t4x4 inv = hlsl::getMatrix3x4As4x4<hlsl::float32_t>(a); // promote to 4x4 so it can be inverted
+            inv = hlsl::inverse(inv); // inverse() returns its result, so it must be assigned back
 
-            vectorSIMDf normal(_in.getNormal());
+            vectorSIMDf normal(in.getNormal());
             // transform by inverse transpose
-            return plane3dSIMDf(inv.rows[0]*normal.xxxx()+inv.rows[1]*normal.yyyy()+inv.rows[2]*normal.zzzz()+(normal.wwww()&BUILD_MASKF(0,0,0,1)));
+			hlsl::float32_t4 planeEq = inv[0] * hlsl::float32_t4(normal.x) + inv[1] * hlsl::float32_t4(normal.y) + inv[2] * hlsl::float32_t4(normal.z) + (hlsl::float32_t4(0, 0, 0, normal.w));
+			// copy the four components back into a vectorSIMDf for the plane3dSIMDf constructor
+			vectorSIMDf planeEqSIMD;
+			for (int i = 0; i < 4; ++i)
+				planeEqSIMD[i] = planeEq[i];
+            return plane3dSIMDf(planeEqSIMD);
 		    #undef BUILD_MASKF
         }
 
diff --git a/include/nbl/core/util/bitflag.h b/include/nbl/core/util/bitflag.h
index 209879966d..1731c0cac3 100644
--- a/include/nbl/core/util/bitflag.h
+++ b/include/nbl/core/util/bitflag.h
@@ -59,18 +59,20 @@ namespace nbl::hlsl::cpp_compat_intrinsics_impl
 	template<typename ENUM_TYPE>
 	struct find_lsb_helper<core::bitflag<ENUM_TYPE>>
 	{
-		static int32_t findLSB(NBL_CONST_REF_ARG(core::bitflag<ENUM_TYPE>) val)
+		using return_t = int32_t; // result type of __call for this specialization
+		static return_t __call(NBL_CONST_REF_ARG(core::bitflag<ENUM_TYPE>) val) // renamed from findLSB
 		{
-			return find_lsb_helper<ENUM_TYPE>::findLSB(val.value);
+			return find_lsb_helper<ENUM_TYPE>::__call(val.value); // forward the bitflag's `value` member to the ENUM_TYPE helper
 		}
 	};
 
 	template<typename ENUM_TYPE>
 	struct find_msb_helper<core::bitflag<ENUM_TYPE>>
 	{
-		static int32_t findMSB(NBL_CONST_REF_ARG(core::bitflag<ENUM_TYPE>) val)
+		using return_t = int32_t; // result type of __call for this specialization
+		static return_t __call(NBL_CONST_REF_ARG(core::bitflag<ENUM_TYPE>) val) // renamed from findMSB
 		{
-			return find_msb_helper<ENUM_TYPE>::findMSB(val.value);
+			return find_msb_helper<ENUM_TYPE>::__call(val.value); // forward the bitflag's `value` member to the ENUM_TYPE helper
 		}
 	};
 }
diff --git a/include/nbl/ext/DebugDraw/CDraw3DLine.h b/include/nbl/ext/DebugDraw/CDraw3DLine.h
index 68cd64e9c1..2437ce4bc5 100644
--- a/include/nbl/ext/DebugDraw/CDraw3DLine.h
+++ b/include/nbl/ext/DebugDraw/CDraw3DLine.h
@@ -91,7 +91,7 @@ class CDraw3DLine : public core::IReferenceCounted
 		*/
 		void recordToCommandBuffer(video::IGPUCommandBuffer* cmdBuffer, video::IGPUGraphicsPipeline* graphics_pipeline);
 
-		inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const core::matrix3x4SIMD& tform=core::matrix3x4SIMD())
+		inline void addBox(const core::aabbox3df& box, float r, float g, float b, float a, const hlsl::float32_t3x4& tform=hlsl::float32_t3x4())
 		{
 			auto addLine = [&](auto s, auto e) -> void
 			{
diff --git a/include/nbl/ext/MitsubaLoader/CElementShape.h b/include/nbl/ext/MitsubaLoader/CElementShape.h
index 205023afea..c1725963b2 100644
--- a/include/nbl/ext/MitsubaLoader/CElementShape.h
+++ b/include/nbl/ext/MitsubaLoader/CElementShape.h
@@ -225,7 +225,7 @@ class CElementShape : public IElement
 		std::string getLogName() const override { return "shape"; }
 
 		
-		inline core::matrix3x4SIMD getAbsoluteTransform() const
+		inline hlsl::float32_t3x4 getAbsoluteTransform() const
 		{
 			auto local = transform.matrix.extractSub3x4();
 			// TODO restore at some point (and make it actually work??)
diff --git a/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h b/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h
index 5deb0982db..3021cc42ec 100644
--- a/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h
+++ b/include/nbl/ext/MitsubaLoader/CMitsubaLoader.h
@@ -13,12 +13,10 @@
 #include "nbl/ext/MitsubaLoader/CMitsubaMetadata.h"
 #include "nbl/ext/MitsubaLoader/CElementShape.h"
 #include "nbl/ext/MitsubaLoader/SContext.h"
-
+#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 
 namespace nbl::ext::MitsubaLoader
 {
-
-
 class CElementBSDF;
 class CMitsubaMaterialCompilerFrontend;
 
@@ -27,7 +25,7 @@ class CMitsubaMaterialCompilerFrontend;
 //#include "nbl/builtin/glsl/ext/MitsubaLoader/instance_data_struct.glsl"
 #define uint uint32_t
 #define uvec2 uint64_t
-#define mat4x3 nbl::core::matrix3x4SIMD
+#define mat4x3 nbl::hlsl::float32_t3x4
 #define nbl_glsl_MC_material_data_t asset::material_compiler::material_data_t
 struct nbl_glsl_ext_Mitsuba_Loader_instance_data_t
 {
@@ -70,8 +68,8 @@ class CMitsubaLoader : public asset::IRenderpassIndependentPipelineLoader
 
 		//
 		core::vector<SContext::shape_ass_type>	getMesh(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const system::logger_opt_ptr& logger);
-		core::vector<SContext::shape_ass_type>	loadShapeGroup(SContext& ctx, uint32_t hierarchyLevel, const CElementShape::ShapeGroup* shapegroup, const core::matrix3x4SIMD& relTform, const system::logger_opt_ptr& _logger);
-		SContext::shape_ass_type				loadBasicShape(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const core::matrix3x4SIMD& relTform, const system::logger_opt_ptr& logger);
+		core::vector<SContext::shape_ass_type>	loadShapeGroup(SContext& ctx, uint32_t hierarchyLevel, const CElementShape::ShapeGroup* shapegroup, const hlsl::float32_t3x4& relTform, const system::logger_opt_ptr& _logger);
+		SContext::shape_ass_type				loadBasicShape(SContext& ctx, uint32_t hierarchyLevel, CElementShape* shape, const hlsl::float32_t3x4& relTform, const system::logger_opt_ptr& logger);
 		
 		void									cacheTexture(SContext& ctx, uint32_t hierarchyLevel, const CElementTexture* texture, const CMitsubaMaterialCompilerFrontend::E_IMAGE_VIEW_SEMANTIC semantic);
 
diff --git a/include/nbl/ext/MitsubaLoader/SContext.h b/include/nbl/ext/MitsubaLoader/SContext.h
index 5ec0096ba4..5423af5fca 100644
--- a/include/nbl/ext/MitsubaLoader/SContext.h
+++ b/include/nbl/ext/MitsubaLoader/SContext.h
@@ -190,7 +190,7 @@ namespace MitsubaLoader
 
 		struct SInstanceData
 		{
-			SInstanceData(core::matrix3x4SIMD _tform, SContext::bsdf_type _bsdf, const std::string& _id, const CElementEmitter& _emitterFront, const CElementEmitter& _emitterBack) :
+			SInstanceData(hlsl::float32_t3x4 _tform, SContext::bsdf_type _bsdf, const std::string& _id, const CElementEmitter& _emitterFront, const CElementEmitter& _emitterBack) :
 				tform(_tform), bsdf(_bsdf),
 #if defined(_NBL_DEBUG) || defined(_NBL_RELWITHDEBINFO)
 				bsdf_id(_id),
@@ -198,7 +198,7 @@ namespace MitsubaLoader
 				emitter{_emitterFront, _emitterBack}
 			{}
 
-			core::matrix3x4SIMD tform;
+			hlsl::float32_t3x4 tform;
 			SContext::bsdf_type bsdf;
 #if defined(_NBL_DEBUG) || defined(_NBL_RELWITHDEBINFO)
 			std::string bsdf_id;
diff --git a/include/nbl/scene/ILevelOfDetailLibrary.h b/include/nbl/scene/ILevelOfDetailLibrary.h
index 3276e3bf4f..e7f76e92a0 100644
--- a/include/nbl/scene/ILevelOfDetailLibrary.h
+++ b/include/nbl/scene/ILevelOfDetailLibrary.h
@@ -6,6 +6,7 @@
 
 #include "nbl/video/ILogicalDevice.h"
 #include "nbl/video/utilities/IDrawIndirectAllocator.h"
+#include "nbl/builtin/hlsl/cpp_compat/intrinsics.h"
 
 namespace nbl::scene
 {
@@ -26,11 +27,11 @@ class ILevelOfDetailLibrary : public virtual core::IReferenceCounted
 				return distanceSqAtReferenceFoV<other.distanceSqAtReferenceFoV;
 			}
 
-			static inline float getFoVDilationFactor(const core::matrix4SIMD& proj)
+			static inline float getFoVDilationFactor(const hlsl::float32_t4x4& proj) // must be 4x4: row 3 is read below
 			{
-				if (proj.rows[3].w!=0.f)
+				if (proj[3].w!=0.f) // a non-zero w in the last row yields NaN instead of a factor
 					return core::nan<float>();
-				return abs(proj.rows[0].x*proj.rows[1].y-proj.rows[0].y*proj.rows[1].x)/dot(proj.rows[3],proj.rows[3]).x;
+				return abs(proj[0].x*proj[1].y-proj[0].y*proj[1].x)/hlsl::dot(proj[3],proj[3]); // |det of upper-left 2x2| over squared length of row 3
 			}
 		};
 		template<typename InfoType, template<class...> class container=core::vector>
diff --git a/include/nbl/video/IGPUAccelerationStructure.h b/include/nbl/video/IGPUAccelerationStructure.h
index 5237b5d6f1..8c0311e114 100644
--- a/include/nbl/video/IGPUAccelerationStructure.h
+++ b/include/nbl/video/IGPUAccelerationStructure.h
@@ -279,7 +279,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03809
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pInfos-03810
 							// https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresKHR-pInfos-03773
-							if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(core::matrix3x4SIMD),sizeof(core::vectorSIMDf)))
+							if (Base::invalidInputBuffer(geometry.transform,buildRangeInfo.transformByteOffset,1u,sizeof(hlsl::float32_t3x4),sizeof(core::vectorSIMDf)))
 								return false;
 						}
 						else
diff --git a/include/quaternion.h b/include/quaternion.h
deleted file mode 100644
index c1867235db..0000000000
--- a/include/quaternion.h
+++ /dev/null
@@ -1,462 +0,0 @@
-// Copyright (C) 2019 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine" and was originally part of the "Irrlicht Engine"
-// For conditions of distribution and use, see copyright notice in nabla.h
-// See the original file in irrlicht source for authors
-
-#ifndef __NBL_QUATERNION_H_INCLUDED__
-#define __NBL_QUATERNION_H_INCLUDED__
-
-
-#include "vectorSIMD.h"
-
-#include "nbl/core/math/glslFunctions.h"
-
-
-namespace nbl
-{
-namespace core
-{
-
-class matrix3x4SIMD;
-
-
-//! Quaternion class for representing rotations.
-/** It provides cheap combinations and avoids gimbal locks.
-Also useful for interpolations. */
-class quaternion : private vectorSIMDf
-{
-	public:
-		//! Default Constructor
-		inline quaternion() : vectorSIMDf(0,0,0,1) {}
-
-		inline quaternion(const quaternion& other) : vectorSIMDf(static_cast<const vectorSIMDf&>(other)) {}
-
-		inline quaternion(const float* data) : vectorSIMDf(data) {}
-
-		//! Constructor
-		inline quaternion(const float& x, const float& y, const float& z, const float& w) : vectorSIMDf(x,y,z,w) { }
-
-		//! Constructor which converts euler angles (radians) to a quaternion
-		inline quaternion(const float& pitch, const float& yaw, const float& roll) {set(pitch,yaw,roll);}
-
-		//! Constructor which converts a matrix to a quaternion
-		explicit quaternion(const matrix3x4SIMD& m);
-
-      		inline float* getPointer() {return pointer;}
-
-		//! Equalilty operator
-		inline vector4db_SIMD operator==(const quaternion& other) const {return vectorSIMDf::operator==(other);}
-
-		//! inequality operator
-		inline vector4db_SIMD operator!=(const quaternion& other) const {return vectorSIMDf::operator!=(other);}
-
-		//! Assignment operator
-		inline quaternion& operator=(const quaternion& other) {return reinterpret_cast<quaternion&>(vectorSIMDf::operator=(other));}
-
-		//! Multiplication operator with scalar
-		inline quaternion operator*(const float& s) const
-		{
-		    quaternion tmp;
-		    reinterpret_cast<vectorSIMDf&>(tmp) = reinterpret_cast<const vectorSIMDf*>(this)->operator*(s);
-		    return tmp;
-		}
-
-		//! Multiplication operator with scalar
-		inline quaternion& operator*=(const float& s)
-		{
-		    *this = (*this)*s;
-		    return *this;
-		}
-
-		//! Multiplication operator
-		inline quaternion& operator*=(const quaternion& other)
-		{
-		    *this = (*this)*other;
-		    return *this;
-		}
-
-		//! Multiplication operator
-		//http://momchil-velikov.blogspot.fr/2013/10/fast-sse-quternion-multiplication.html
-		inline quaternion operator*(const quaternion& other) const
-        {
-            __m128 xyzw = vectorSIMDf::getAsRegister();
-            __m128 abcd = reinterpret_cast<const vectorSIMDf&>(other).getAsRegister();
-
-          __m128 t0 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (3, 3, 3, 3)); /* 1, 0.5 */
-          __m128 t1 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 3, 0, 1)); /* 1, 0.5 */
-
-          __m128 t3 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (0, 0, 0, 0)); /* 1, 0.5 */
-          __m128 t4 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (1, 0, 3, 2)); /* 1, 0.5 */
-
-          __m128 t5 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (1, 1, 1, 1)); /* 1, 0.5 */
-          __m128 t6 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (2, 0, 3, 1)); /* 1, 0.5 */
-
-          /* [d,d,d,d]*[z,w,x,y] = [dz,dw,dx,dy] */
-          __m128 m0 = _mm_mul_ps (t0, t1); /* 5/4, 1 */
-
-          /* [a,a,a,a]*[y,x,w,z] = [ay,ax,aw,az]*/
-          __m128 m1 = _mm_mul_ps (t3, t4); /* 5/4, 1 */
-
-          /* [b,b,b,b]*[z,x,w,y] = [bz,bx,bw,by]*/
-          __m128 m2 = _mm_mul_ps (t5, t6); /* 5/4, 1 */
-
-          /* [c,c,c,c]*[w,z,x,y] = [cw,cz,cx,cy] */
-          __m128 t7 = FAST_FLOAT_SHUFFLE(abcd, _MM_SHUFFLE (2, 2, 2, 2)); /* 1, 0.5 */
-          __m128 t8 = FAST_FLOAT_SHUFFLE(xyzw, _MM_SHUFFLE (3, 2, 0, 1)); /* 1, 0.5 */
-
-          __m128 m3 = _mm_mul_ps (t7, t8); /* 5/4, 1 */
-
-          /* 1 */
-          /* [dz,dw,dx,dy]+-[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az] */
-          __m128 e = _mm_addsub_ps (m0, m1); /* 3, 1 */
-
-          /* 2 */
-          /* [dx+aw,dz+ay,dy-az,dw-ax] */
-          e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (1, 3, 0, 2)); /* 1, 0.5 */
-
-          /* [dx+aw,dz+ay,dy-az,dw-ax]+-[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]*/
-          e = _mm_addsub_ps (e, m2); /* 3, 1 */
-
-          /* 2 */
-          /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] */
-          e = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 0, 1, 3)); /* 1, 0.5 */
-
-          /* [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]+-[cw,cz,cx,cy]
-             = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy] */
-          e = _mm_addsub_ps (e, m3); /* 3, 1 */
-
-          /* 2 */
-          /* [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy] */
-          quaternion tmp;
-          reinterpret_cast<vectorSIMDf&>(tmp) = FAST_FLOAT_SHUFFLE(e, _MM_SHUFFLE (2, 3, 1, 0)); /* 1, 0.5 */
-          return tmp;
-        }
-
-        inline vectorSIMDf transformVect(const vectorSIMDf& vec)
-        {
-            vectorSIMDf direction = *reinterpret_cast<const vectorSIMDf*>(this);
-            vectorSIMDf scale = core::length(direction);
-            direction.makeSafe3D();
-
-            return scale*vec+cross(direction,vec*W+cross(direction,vec))*2.f;
-        }
-
-		//! Sets new quaternion
-		inline quaternion& set(const vectorSIMDf& xyzw)
-		{
-		    *this = reinterpret_cast<const quaternion&>(xyzw);
-		    return *this;
-		}
-
-		//! Sets new quaternion based on euler angles (radians)
-		inline quaternion& set(const float& roll, const float& pitch, const float& yaw);
-
-		//! Sets new quaternion from other quaternion
-		inline quaternion& set(const quaternion& quat)
-		{
-		    *this = quat;
-		    return *this;
-		}
-
-		//! Inverts this quaternion
-		inline void makeInverse()
-		{
-		    reinterpret_cast<vectorSIMDf&>(*this) ^= _mm_set_epi32(0x0u,0x80000000u,0x80000000u,0x80000000u);
-		}
-
-		//! Fills an angle (radians) around an axis (unit vector)
-		void toAngleAxis(float& angle, vector3df_SIMD& axis) const;
-
-		//! Output this quaternion to an euler angle (radians)
-		void toEuler(vector3df_SIMD& euler) const;
-
-		//! Set quaternion to identity
-		inline void makeIdentity() {vectorSIMDf::set(0,0,0,1);}
-
-
-        vectorSIMDf& getData() {return *((vectorSIMDf*)this);}
-
-//statics
-        inline static quaternion normalize(const quaternion& in)
-        {
-            quaternion tmp;
-            reinterpret_cast<vectorSIMDf&>(tmp) = core::normalize(reinterpret_cast<const vectorSIMDf&>(in));
-            return tmp;
-        }
-
-        //! Helper func
-		static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover);
-
-		//! Set this quaternion to the linear interpolation between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param interpolant Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		*/
-		static quaternion lerp(const quaternion &q1, const quaternion &q2, const float& interpolant);
-
-        //! Helper func
-		static inline void flerp_interpolant_terms(float& interpolantPrecalcTerm2, float& interpolantPrecalcTerm3, const float& interpolant)
-        {
-            interpolantPrecalcTerm2 = (interpolant - 0.5f) * (interpolant - 0.5f);
-            interpolantPrecalcTerm3 = interpolant * (interpolant - 0.5f) * (interpolant - 1.f);
-        }
-
-		static float flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3);
-
-		//! Set this quaternion to the approximate slerp between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param interpolant Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		*/
-		static quaternion flerp(const quaternion &q1, const quaternion &q2, const float& interpolant);
-
-		//! Set this quaternion to the result of the spherical interpolation between two quaternions
-		/** \param q1 First quaternion to be interpolated.
-		\param q2 Second quaternion to be interpolated.
-		\param time Progress of interpolation. For interpolant=0 the result is
-		q1, for interpolant=1 the result is q2. Otherwise interpolation
-		between q1 and q2.
-		\param threshold To avoid inaccuracies the
-		interpolation switches to linear interpolation at some point.
-		This value defines how much of the interpolation will
-		be calculated with lerp.
-		*/
-		static quaternion slerp(const quaternion& q1, const quaternion& q2,
-				const float& interpolant, const float& threshold=.05f);
-
-		inline static quaternion fromEuler(const vector3df_SIMD& euler)
-		{
-		    quaternion tmp;
-		    tmp.set(euler.X,euler.Y,euler.Z);
-		    return tmp;
-        }
-
-		inline static quaternion fromEuler(const vector3df& euler)
-		{
-		    quaternion tmp;
-		    tmp.set(euler.X,euler.Y,euler.Z);
-		    return tmp;
-        }
-
-		//! Set quaternion to represent a rotation from one vector to another.
-		static quaternion rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to);
-
-		//! Create quaternion from rotation angle and rotation axis.
-		/** Axis must be unit length.
-		The quaternion representing the rotation is
-		q = cos(A/2)+sin(A/2)*(x*i+y*j+z*k).
-		\param angle Rotation Angle in radians.
-		\param axis Rotation axis. */
-		static quaternion fromAngleAxis(const float& angle, const vector3df_SIMD& axis);
-};
-static_assert(sizeof(quaternion) == sizeof(vectorSIMDf), "Quaternion not same size as vec4");
-
-
-// set this quaternion to the result of the linear interpolation between two quaternions
-inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const bool& wrongDoubleCover)
-{
-    vectorSIMDf retval;
-	if (wrongDoubleCover)
-        retval = mix<vectorSIMDf>(reinterpret_cast<const vectorSIMDf&>(q1),-reinterpret_cast<const vectorSIMDf&>(q2),vectorSIMDf(interpolant));
-    else
-        retval = mix<vectorSIMDf>(reinterpret_cast<const vectorSIMDf&>(q1), reinterpret_cast<const vectorSIMDf&>(q2),vectorSIMDf(interpolant));
-    return reinterpret_cast<const quaternion&>(retval);
-}
-
-// set this quaternion to the result of the linear interpolation between two quaternions
-inline quaternion quaternion::lerp(const quaternion &q1, const quaternion &q2, const float& interpolant)
-{
-	const float angle = dot<vectorSIMDf>(q1,q2)[0];
-    return lerp(q1,q2,interpolant,angle < 0.0f);
-}
-
-// Arseny Kapoulkine
-inline float quaternion::flerp_adjustedinterpolant(const float& angle, const float& interpolant, const float& interpolantPrecalcTerm2, const float& interpolantPrecalcTerm3)
-{
-    float A = 1.0904f + angle * (-3.2452f + angle * (3.55645f - angle * 1.43519f));
-    float B = 0.848013f + angle * (-1.06021f + angle * 0.215638f);
-    float k = A * interpolantPrecalcTerm2 + B;
-    float ot = interpolant + interpolantPrecalcTerm3 * k;
-    return ot;
-}
-
-// set this quaternion to the result of an approximate slerp
-inline quaternion quaternion::flerp(const quaternion &q1, const quaternion &q2, const float& interpolant)
-{
-	const float angle = dot<vectorSIMDf>(q1,q2)[0];
-    return lerp(q1,q2,flerp_adjustedinterpolant(fabsf(angle),interpolant,(interpolant - 0.5f) * (interpolant - 0.5f),interpolant * (interpolant - 0.5f) * (interpolant - 1.f)),angle < 0.0f);
-}
-
-
-// set this quaternion to the result of the interpolation between two quaternions
-inline quaternion quaternion::slerp(const quaternion &q1, const quaternion &q2, const float& interpolant, const float& threshold)
-{
-	float angle = dot<vectorSIMDf>(q1,q2)[0];
-
-	// make sure we use the short rotation
-	bool wrongDoubleCover = angle < 0.0f;
-	if (wrongDoubleCover)
-		angle *= -1.f;
-
-	if (angle <= (1.f-threshold)) // spherical interpolation
-	{ // acosf + sinf
-        vectorSIMDf retval;
-
-		const float sinARcp  = inversesqrt(1.f-angle*angle);
-		const float sinAt = sinf(acosf(angle) * interpolant); // could this line be optimized?
-		//1sqrt 3min/add 5mul from now on
-		const float sinAt_over_sinA = sinAt*sinARcp;
-
-        const float scale = core::sqrt(1.f-sinAt*sinAt)-angle*sinAt_over_sinA; //cosAt-cos(A)sin(tA)/sin(A) = (sin(A)cos(tA)-cos(A)sin(tA))/sin(A)
-        if (wrongDoubleCover) // make sure we use the short rotation
-            retval = reinterpret_cast<const vectorSIMDf&>(q1)*scale - reinterpret_cast<const vectorSIMDf&>(q2)*sinAt_over_sinA;
-        else
-            retval = reinterpret_cast<const vectorSIMDf&>(q1)*scale + reinterpret_cast<const vectorSIMDf&>(q2)*sinAt_over_sinA;
-
-        return reinterpret_cast<const quaternion&>(retval);
-	}
-	else
-        return normalize(lerp(q1,q2,interpolant,wrongDoubleCover));
-}
-
-
-#if !NBL_TEST_BROKEN_QUATERNION_USE
-//! axis must be unit length, angle in radians
-inline quaternion quaternion::fromAngleAxis(const float& angle, const vector3df_SIMD& axis)
-{
-	const float fHalfAngle = 0.5f*angle;
-	const float fSin = sinf(fHalfAngle);
-	quaternion retval;
-	reinterpret_cast<vectorSIMDf&>(retval) = axis*fSin;
-	reinterpret_cast<vectorSIMDf&>(retval).W = cosf(fHalfAngle);
-	return retval;
-}
-
-
-inline void quaternion::toAngleAxis(float& angle, vector3df_SIMD &axis) const
-{
-    vectorSIMDf scale = core::length(*reinterpret_cast<const vectorSIMDf*>(this));
-
-	if (scale.X==0.f)
-	{
-		angle = 0.0f;
-		axis.X = 0.0f;
-		axis.Y = 1.0f;
-		axis.Z = 0.0f;
-	}
-	else
-	{
-	    axis = reinterpret_cast<const vectorSIMDf*>(this)->operator/(scale);
-		angle = 2.f * acosf(axis.W);
-
-	    axis.makeSafe3D();
-	}
-}
-
-inline void quaternion::toEuler(vector3df_SIMD& euler) const
-{
-	vectorSIMDf sqr = *reinterpret_cast<const vectorSIMDf*>(this);
-	sqr *= sqr;
-	const double test = 2.0 * (Y*W - X*Z);
-
-	if (core::equals(test, 1.0, 0.000001))
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) (-2.0*atan2(X, W));
-		// bank = rotation about x-axis
-		euler.X = 0;
-		// attitude = rotation about y-axis
-		euler.Y = core::HALF_PI<float>();
-	}
-	else if (core::equals(test, -1.0, 0.000001))
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) (2.0*atan2(X, W));
-		// bank = rotation about x-axis
-		euler.X = 0;
-		// attitude = rotation about y-axis
-		euler.Y = -core::HALF_PI<float>();
-	}
-	else
-	{
-		// heading = rotation about z-axis
-		euler.Z = (float) atan2(2.0 * (X*Y +Z*W),(sqr.X - sqr.Y - sqr.Z + sqr.W));
-		// bank = rotation about x-axis
-		euler.X = (float) atan2(2.0 * (Y*Z +X*W),(-sqr.X - sqr.Y + sqr.Z + sqr.W));
-		// attitude = rotation about y-axis
-		euler.Y = (float) asin( core::clamp(test, -1.0, 1.0) );
-	}
-}
-
-inline quaternion quaternion::rotationFromTo(const vector3df_SIMD& from, const vector3df_SIMD& to)
-{
-	// Based on Stan Melax's article in Game Programming Gems
-	// Copy, since cannot modify local
-	vector3df_SIMD v0 = from;
-	vector3df_SIMD v1 = to;
-	v0 = core::normalize(v0);
-	v1 = core::normalize(v1);
-
-	const vectorSIMDf dddd = core::dot(v0,v1);
-	quaternion tmp;
-	if (dddd.X >= 1.0f) // If dot == 1, vectors are the same
-	{
-		return tmp;
-	}
-	else if (dddd.X <= -1.0f) // exactly opposite
-	{
-		vector3df_SIMD axis(1.0f, 0.f, 0.f);
-		axis = cross(axis,v0);
-		if (length(axis)[0]==0.f)
-		{
-			axis.set(0.f,1.f,0.f);
-			axis = cross(axis,v0);
-		}
-		// same as fromAngleAxis(PI, axis).normalize();
-		reinterpret_cast<vectorSIMDf&>(tmp) = axis;
-		return normalize(tmp);
-	}
-
-    vectorSIMDf s = core::sqrt(vectorSIMDf(2.f,2.f,2.f,0.f)+dddd*2.f);
-	reinterpret_cast<vectorSIMDf&>(tmp) = cross(v0,v1)*reciprocal_approxim(s);
-	tmp.W = s.X*0.5f;
-    return normalize(tmp);
-}
-#endif
-
-// sets new quaternion based on euler angles
-inline quaternion& quaternion::set(const float& roll, const float& pitch, const float& yaw)
-{
-	float angle;
-
-	angle = roll * 0.5f;
-	const float sr = sinf(angle);
-	const float cr = cosf(angle);
-
-	angle = pitch * 0.5f;
-	const float sp = sinf(angle);
-	const float cp = cos(angle);
-
-	angle = yaw * 0.5f;
-	const float sy = sinf(angle);
-	const float cy = cosf(angle);
-
-	const float cpcy = cp * cy;
-	const float spcy = sp * cy;
-	const float cpsy = cp * sy;
-	const float spsy = sp * sy;
-
-    *reinterpret_cast<vectorSIMDf*>(this) = vectorSIMDf(sr,cr,cr,cr)*vectorSIMDf(cpcy,spcy,cpsy,cpcy)+vectorSIMDf(-cr,sr,-sr,sr)*vectorSIMDf(spsy,cpsy,spcy,spsy);
-
-	return *this;
-}
-
-} // end namespace core
-} // end namespace nbl
-
-#endif
-
diff --git a/include/vector3d.h b/include/vector3d.h
index 327eaacdb2..e5a4905da3 100644
--- a/include/vector3d.h
+++ b/include/vector3d.h
@@ -6,7 +6,7 @@
 #ifndef __NBL_POINT_3D_H_INCLUDED__
 #define __NBL_POINT_3D_H_INCLUDED__
 
-#include "nbl/core/math/glslFunctions.h"
+#include <nbl/core/math/glslFunctions.h>
 
 namespace nbl
 {
@@ -63,7 +63,7 @@ namespace core
 		//! use weak float compare
 		bool operator==(const vector3d<T>& other) const
 		{
-			return core::equals<vector3d<T> >(*this,other,vector3d<T>(core::ROUNDING_ERROR<T>()));
+			return core::equals<vector3d<T> >(*this, other, vector3d<T>(core::ROUNDING_ERROR<T>()));
 		}
 
 		bool operator!=(const vector3d<T>& other) const
@@ -72,7 +72,6 @@ namespace core
 		}
 
 		// functions
-
 		vector3d<T>& set(const T nx, const T ny, const T nz) {X=nx; Y=ny; Z=nz; return *this;}
 		vector3d<T>& set(const vector3d<T>& p) {X=p.X; Y=p.Y; Z=p.Z;return *this;}
 
@@ -275,7 +274,6 @@ namespace core
 	template<class S, class T>
 	vector3d<T> operator*(const S scalar, const vector3d<T>& vector) { return vector*scalar; }
 
-
 } // end namespace core
 } // end namespace nbl
 
diff --git a/include/vectorSIMD.h b/include/vectorSIMD.h
index 4c9c90d236..7d0dc8d966 100644
--- a/include/vectorSIMD.h
+++ b/include/vectorSIMD.h
@@ -887,6 +887,24 @@ namespace core
 		}
 	};
 
+	//! Copies the four components of a vectorSIMDf into an hlsl::float32_t4.
+	inline hlsl::float32_t4 getAsVec4(const vectorSIMDf& vec)
+	{
+		hlsl::float32_t4 result;
+		for (int component = 0; component != 4; ++component)
+			result[component] = vec[component];
+		return result;
+	}
+
+	//! Copies the first three components of a vectorSIMDf into an hlsl::float32_t3.
+	inline hlsl::float32_t3 getAsVec3(const vectorSIMDf& vec)
+	{
+		hlsl::float32_t3 result;
+		for (int component = 0; component != 3; ++component)
+			result[component] = vec[component];
+		return result;
+	}
+
 } // end namespace core
 } // end namespace nbl
 
diff --git a/src/nbl/asset/IAssetManager.cpp b/src/nbl/asset/IAssetManager.cpp
index a6ec07a010..545015cdaf 100644
--- a/src/nbl/asset/IAssetManager.cpp
+++ b/src/nbl/asset/IAssetManager.cpp
@@ -297,7 +297,7 @@ void IAssetManager::insertBuiltinAssets()
     addBuiltInToCaches(ds3Layout, "nbl/builtin/material/lambertian/singletexture/descriptor_set_layout/3"); // TODO find everything what has been using it so far
 
 	constexpr uint32_t pcCount = 1u;
-	asset::SPushConstantRange pcRanges[pcCount] = {asset::IShader::E_SHADER_STAGE::ESS_VERTEX,0u,sizeof(core::matrix4SIMD)};
+	asset::SPushConstantRange pcRanges[pcCount] = {asset::IShader::E_SHADER_STAGE::ESS_VERTEX,0u,sizeof(hlsl::float32_t4x4)}; // 4x4 counterpart of the matrix4SIMD it replaces, so the range keeps 16 floats
 	auto pLayout = core::make_smart_refctd_ptr<asset::ICPUPipelineLayout>(
 			std::span<const asset::SPushConstantRange>(pcRanges,pcCount),
 			nullptr,core::smart_refctd_ptr(ds1Layout),nullptr,core::smart_refctd_ptr(ds3Layout)
diff --git a/src/nbl/asset/interchange/CGLTFLoader.cpp b/src/nbl/asset/interchange/CGLTFLoader.cpp
index fde9552179..75f5578043 100644
--- a/src/nbl/asset/interchange/CGLTFLoader.cpp
+++ b/src/nbl/asset/interchange/CGLTFLoader.cpp
@@ -2398,7 +2398,7 @@ using namespace nbl::asset;
 							{
 								core::vector3df_SIMD translation = {};							//!< The node's translation along the x, y, and z axes.
 								core::vector3df_SIMD scale = core::vector3df_SIMD(1.f,1.f,1.f);	//!< The node's non-uniform scale, given as the scaling factors along the x, y, and z axes.
-								core::quaternion rotation = {};									//!< The node's unit quaternion rotation in the order (x, y, z, w), where w is the scalar.
+								hlsl::quaternion<float> rotation = {};									//!< The node's unit quaternion rotation in the order (x, y, z, w), where w is the scalar.
 							} trs;
 
 							if (translation.error() != simdjson::error_code::NO_SUCH_FIELD)
diff --git a/src/nbl/asset/utils/CGeometryCreator.cpp b/src/nbl/asset/utils/CGeometryCreator.cpp
index ea18b4bf2b..9142aa2aeb 100644
--- a/src/nbl/asset/utils/CGeometryCreator.cpp
+++ b/src/nbl/asset/utils/CGeometryCreator.cpp
@@ -680,7 +680,7 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin
 	DiskVertex* ptr = (DiskVertex*)vertices->getPointer();
 
 	const core::vectorSIMDf v0(0.0f, radius, 0.0f, 1.0f);
-	core::matrix3x4SIMD rotation;
+	hlsl::float32_t3x4 rotation;
 
 	//center
 	ptr[0] = DiskVertex(core::vector3df_SIMD(0.0f), video::SColor(0xFFFFFFFFu),
@@ -696,10 +696,9 @@ CGeometryCreator::return_type CGeometryCreator::createDiskMesh(float radius, uin
 	//v1, v2, ..., vn-1
 	for (int i = 2; i < vertexCount-1; i++)
 	{
-		core::vectorSIMDf vn;
-		core::matrix3x4SIMD rotMatrix;
-		rotMatrix.setRotation(core::quaternion(0.0f, 0.0f, core::radians((i-1)*angle)));
-		rotMatrix.transformVect(vn, v0);
+		hlsl::float32_t3x4 rotMatrix;
+		hlsl::setRotation<hlsl::float32_t, 3>(rotMatrix, hlsl::quaternion<hlsl::float32_t>::create(0.0f, 0.0f, core::radians((i - 1) * angle)));
+		core::vectorSIMDf vn = hlsl::transformVector<hlsl::float32_t>(hlsl::getMatrix3x4As4x4<hlsl::float32_t>(rotMatrix), v0);
 
 		ptr[i] = DiskVertex(vn, video::SColor(0xFFFFFFFFu),
 			core::vector2du32_SIMD(0u, 1u), core::vector3df_SIMD(0.0f, 0.0f, 1.0f));
diff --git a/src/nbl/asset/utils/CMeshManipulator.cpp b/src/nbl/asset/utils/CMeshManipulator.cpp
index d06762bf86..859c1859c1 100644
--- a/src/nbl/asset/utils/CMeshManipulator.cpp
+++ b/src/nbl/asset/utils/CMeshManipulator.cpp
@@ -9,7 +9,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-
 #include "nbl/asset/asset.h"
 #include "nbl/asset/IRenderpassIndependentPipeline.h"
 #include "nbl/asset/utils/CMeshManipulator.h"
@@ -1526,7 +1525,7 @@ float IMeshManipulator::DistanceToPlane(core::vectorSIMDf InPoint, core::vectorS
     return (core::dot(PointToPlane, PlaneNormal).x >= 0) ? core::abs(core::dot(PointToPlane, PlaneNormal).x) : 0;
 }
 
-core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer) 
+hlsl::float32_t3x4 IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuffer* meshbuffer) 
 {
     auto FindMinMaxProj = [&](const core::vectorSIMDf& Dir, const core::vectorSIMDf Extrema[]) -> core::vectorSIMDf
     {
@@ -1682,7 +1681,7 @@ core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuf
     ComputeAxis(P2, P1, Q1, BestAxis, BestQuality, Extrema);
     ComputeAxis(Q1, P2, P1, BestAxis, BestQuality, Extrema);
 
-    core::matrix3x4SIMD TransMat = core::matrix3x4SIMD(
+    hlsl::float32_t3x4 TransMat = hlsl::float32_t3x4(
         BestAxis[0].x, BestAxis[1].x, BestAxis[2].x, 0,
         BestAxis[0].y, BestAxis[1].y, BestAxis[2].y, 0,
         BestAxis[0].z, BestAxis[1].z, BestAxis[2].z, 0);
@@ -1705,21 +1704,22 @@ core::matrix3x4SIMD IMeshManipulator::calculateOBB(const nbl::asset::ICPUMeshBuf
 
     core::vectorSIMDf ABBDiff = AABBMax - AABBMin;
     float ABBQuality = ABBDiff.x * ABBDiff.y + ABBDiff.y * ABBDiff.z + ABBDiff.z * ABBDiff.x;
-    core::matrix3x4SIMD scaleMat;
-    core::matrix3x4SIMD translationMat;
-    translationMat.setTranslation(-(MinPoint) / OBBDiff);
-    scaleMat.setScale(OBBDiff);
-    TransMat = core::concatenateBFollowedByA(TransMat, scaleMat);
-    TransMat = core::concatenateBFollowedByA(TransMat, translationMat);
-    if (ABBQuality < OBBQuality) {
-        translationMat.setTranslation(-(AABBMin) / ABBDiff);
-        scaleMat.setScale(ABBDiff);
-        TransMat = core::matrix3x4SIMD(
+    hlsl::float32_t3x4 scaleMat;
+    hlsl::float32_t3x4 translationMat;
+    hlsl::setTranslation<hlsl::float32_t, 3>(translationMat, core::getAsVec3(-(MinPoint) / OBBDiff));
+    hlsl::setScale<hlsl::float32_t, 3>(scaleMat, core::getAsVec3(OBBDiff));
+    TransMat = hlsl::concatenateBFollowedByA<hlsl::float32_t>(TransMat, scaleMat);
+    TransMat = hlsl::concatenateBFollowedByA<hlsl::float32_t>(TransMat, translationMat);
+    if (ABBQuality < OBBQuality)
+    {
+        hlsl::setTranslation<hlsl::float32_t, 3>(translationMat, core::getAsVec3(-(AABBMin) / ABBDiff));
+        hlsl::setScale<hlsl::float32_t, 3>(scaleMat, core::getAsVec3(ABBDiff));
+        TransMat = hlsl::float32_t3x4(
             1, 0, 0, 0,
             0, 1, 0, 0,
             0, 0, 1, 0);
-        TransMat = core::concatenateBFollowedByA(TransMat, scaleMat);
-        TransMat = core::concatenateBFollowedByA(TransMat, translationMat);
+        TransMat = hlsl::concatenateBFollowedByA<hlsl::float32_t>(TransMat, scaleMat);
+        TransMat = hlsl::concatenateBFollowedByA<hlsl::float32_t>(TransMat, translationMat);
     }
 
     return TransMat;
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 227f9780ff..659c3a1c3b 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -223,10 +223,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/portable/matrix_t.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ieee754/impl.hlsl")
 # utility
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/array_accessors.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/matrix_utils/matrix_traits.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/vector_utils/vector_traits.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/matrix_utils/matrix_traits.hlsl")
 
 #spirv intrinsics
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/spirv_intrinsics/core.hlsl")
@@ -284,6 +283,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/format.hlsl")
 #linear algebra
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/linalg/fast_affine.hlsl")
 # TODO: rename `equations` to `polynomials` probably
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/functions.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/geometry.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/intutil.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quadratic.hlsl")
@@ -345,15 +345,19 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory_accessor.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/enums.hlsl")
 #
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/binding_info.hlsl")
-#
+#concepts
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/core.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/matrix.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/vector.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/__end.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropically_sampled.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/loadable_image.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl")
-
-# temporary (delete once replaced)
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bitreverse.hlsl")
+#tgmath
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/output_structs.hlsl")
 
 ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 3d6c85cd74..cb8cb3e45b 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -1,5 +1,6 @@
 #include "nbl/video/utilities/CComputeBlit.h"
 #include "nbl/builtin/hlsl/binding_info.hlsl"
+#include "nbl/builtin/hlsl/tgmath.hlsl"
 
 using namespace nbl::core;
 using namespace nbl::hlsl;