// Code:
// ======== HEADER FILE ========
// 4x4 row-major matrix using the row-vector convention (p' = p * M);
// the translation lives in the fourth row (_41.._43).
// The union aliases the same 64 bytes four ways: named scalars, a flat array,
// a 2-D array, and four SSE registers.  NOTE(review): reading a union member
// other than the last one written is technically UB in ISO C++, but this is
// the long-standing idiom every major compiler supports; the __m128 members
// also impose 16-byte alignment on mat4 (heap allocations must honour it).
class mat4 {
public:
union {
struct {
float _11, _12, _13, _14;
float _21, _22, _23, _24;
float _31, _32, _33, _34;
float _41, _42, _43, _44;
};
float d[16];        // flat, row-major
float data[4][4];   // data[row][col]
__m128 rows[4];     // one SSE register per row
struct {
__m128 row0, row1, row2, row3;
};
};
// Deliberately leaves the elements uninitialized -- cheap construction on hot
// paths; call identity()/setup*() before use.
mat4() {}
mat4(float m_11, float m_12, float m_13, float m_14,
float m_21, float m_22, float m_23, float m_24,
float m_31, float m_32, float m_33, float m_34,
float m_41, float m_42, float m_43, float m_44):
_11(m_11), _12(m_12), _13(m_13), _14(m_14),
_21(m_21), _22(m_22), _23(m_23), _24(m_24),
_31(m_31), _32(m_32), _33(m_33), _34(m_34),
_41(m_41), _42(m_42), _43(m_43), _44(m_44) {}
// Build from four SSE rows.  FIX: take const references -- the old non-const
// overload could not bind temporaries or const sources.
mat4(const __m128& nrow0, const __m128& nrow1, const __m128& nrow2, const __m128& nrow3): row0(nrow0), row1(nrow1), row2(nrow2), row3(nrow3) {}
mat4(const mat4& obj) : row0(obj.row0), row1(obj.row1), row2(obj.row2), row3(obj.row3) {}
void zeroTranslation();
void identity();
void setupTranslation(const vec3 &d);
void setupTranslation(float x, float y, float z);
void setTranslation(float x, float y, float z);
void setTranslation(const vec3 &d);
void setupRotateX(float theta);
void setupRotateY(float theta);
void setupRotateZ(float theta);
void setupRotate(const vec3 &axis, float theta);
void setupLookAt(const vec3& eye, const vec3& at, const vec3& up);
void setupOrtho(float w, float h, float zn, float zf);
void setupProject(float fovY, float aspect, float zn, float zf);
void fromquat(const quat &q);
void setupScale(const vec3 &s);
void setupScale(float x, float y, float z);
void setupScaleAlongAxis(const vec3 &axis, float k) ;
void setupShearX(float s, float t);
void setupShearY(float s, float t);
void setupShearZ(float s, float t);
void setupReflectX(float k) ;
void setupReflectY(float k) ;
void setupReflectZ(float k) ;
void setupReflect(const vec3 &n);
void setup(const euler &orientation);
void inverse();
mat4 operator-(const mat4 &b);
mat4& operator-=(const mat4 &m);
mat4 operator+(const mat4 &b);
mat4& operator+=(const mat4 &m);
mat4 operator*(const float &b);
mat4 operator*(const mat4 &b);
mat4& operator*=(const mat4 &m);
void operator=(const mat4& rhs);
float& operator[](uint32_t i){ return d[i]; }
float operator[](uint32_t i) const { return d[i]; }// added: read-only element access
float determinant();
};
// Free operators (defined in the implementation file); row-vector convention.
mat4 operator*(const mat4& left, const mat4 &b);
vec3 operator*(const vec3 &p, const mat4 &m);// point transform: implicit w = 1, translation row applied
vec4 operator*(const vec4 &p, const mat4 &m);// the w component of the vec4 is ignored
vec4 operator*=( vec4 &p, const mat4 &m);// the w component of the vec4 is ignored; NOTE(review): returns a copy by value, unusual for *=
vec3 operator*(const mat4 &m, const vec3 &p);// NOTE(review): implemented identically to p * m -- operand order does NOT transpose
vec3 &operator*=(vec3 &p, const mat4 &m);
// ======== CPP (implementation) FILE ========
void mat4::setup(const euler &orientation) {
float sh,ch, sp,cp, sb,cb;
sincos32(&sh, &ch, orientation.heading);
sincos32(&sp, &cp, orientation.pitch);
sincos32(&sb, &cb, orientation.bank);
_11 = ch * cb + sh * sp * sb;
_12 = -ch * sb + sh * sp * cb;
_13 = sh * cp;
_21 = sb * cp;
_22 = cb * cp;
_23 = -sp;
_31 = -sh * cb + ch * sp * sb;
_32 = sb * sh + ch * sp * cb;
_33 = ch * cp;
}
void mat4::fromquat(const quat &q) {// Setup the matrix to perform a rotation, given the angular displacement in quaternion form. The translation portion is reset.
float ww = 2.0f * q.w;
float xx = 2.0f * q.x;
float yy = 2.0f * q.y;
float zz = 2.0f * q.z;
// Set the matrix elements. There is still a little moreopportuni_42 for optimization due to the many common subexpressions. We'll let the compiler handle that...
_11 = 1.0f - yy*q.y - zz*q.z;
_12 = xx*q.y + ww*q.z;
_13 = xx*q.z - ww*q.x;
_21 = xx*q.y - ww*q.z;
_22 = 1.0f - xx*q.x - zz*q.z;
_23 = yy*q.z + ww*q.x;
_31 = xx*q.z + ww*q.y;
_32 = yy*q.z - ww*q.x;
_33 = 1.0f - xx*q.x - yy*q.y;
row3 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Load the 4x4 identity.  Arguments to _mm_set_ps are given high lane first,
// so each call looks reversed relative to the row it produces (little-endian
// lane order, as the original comment noted).
void mat4::identity() {// backwards looking, but that is because of little endian
rows[0] = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
rows[1] = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
rows[2] = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
rows[3] = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
}
// Clear the translation row to (0,0,0,1), leaving the rotation/scale intact.
void mat4::zeroTranslation() { row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); }
// Build a pure translation matrix: identity rotation, offset d in the last row.
// _mm_setr_ps takes lanes in memory (row-element) order.
void mat4::setupTranslation(const vec3 &d) {
rows[0] = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
rows[1] = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
rows[2] = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
rows[3] = _mm_setr_ps(d.x, d.y, d.z, 1.0f);
}
// Convenience overload: forwards to the vec3 version.
void mat4::setupTranslation(float x, float y, float z) { setupTranslation(vec3(x, y, z)); }
// Overwrite only the translation row with (d.x, d.y, d.z, 1).
void mat4::setTranslation(const vec3 &d) { row3 = _mm_setr_ps(d.x, d.y, d.z, 1.0f); }
// Convenience overload: forwards to the vec3 version.
void mat4::setTranslation(float x, float y, float z){ setTranslation(vec3(x, y, z)); }
// setupRotateX/Y/Z build a rotation about the named cardinal axis (separate functions here, rather than the book's 1-based axis index).
// theta is the amount of rotation, in radians. The left-hand rule is used to define "positive" rotation. The translation portion is reset.
// Rotation about the x-axis; theta in radians (left-hand rule).
// _mm_setr_ps takes lanes in memory order, so each call reads like the row.
void mat4::setupRotateX(float theta) {
float s, c;
sincos32(&s, &c, theta);
row0 = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(0.0f, c, s, 0.0f);
row2 = _mm_setr_ps(0.0f, -s, c, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Rotation about the y-axis; theta in radians (left-hand rule).
// BUGFIX: the old code passed row elements to _mm_set_ps in natural order,
// but _mm_set_ps takes arguments high lane first -- every row was scrambled
// (e.g. row0 came out as (0,-s,0,c) instead of (c,0,-s,0)).  _mm_setr_ps
// takes lanes in memory order, matching the intent.
void mat4::setupRotateY(float theta) {
float s, c;
sincos32(&s, &c, theta);// Get sin and cosine of rotation angle
row0 = _mm_setr_ps(c, 0.0f, -s, 0.0f);
row1 = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
row2 = _mm_setr_ps(s, 0.0f, c, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Rotation about the z-axis; theta in radians (left-hand rule).
// BUGFIX: same operand-order bug as setupRotateY -- _mm_set_ps takes lanes
// high-first, so the rows were scrambled.  _mm_setr_ps uses memory order.
void mat4::setupRotateZ(float theta) {
float s, c;
sincos32(&s, &c, theta);// Get sin and cosine of rotation angle
row0 = _mm_setr_ps(c, s, 0.0f, 0.0f);
row1 = _mm_setr_ps(-s, c, 0.0f, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Setup the matrix to perform a rotation about an arbitrary axis. The axis of rotation must pass through the origin. axis defines the axis of rotation, and must be a unit vector.
// theta is the amount of rotation, in radians. The left-hand rule is used to define "positive" rotation. The translation portion is reset.
void mat4::setupRotate(const vec3 &axis, float theta) {
float s, c;
sincos32(&s, &c, theta);// Get sin and cosine of rotation angle
vec4 aaxis((1.0f - c) * axis), saxis(s *axis); // Compute 1 - cos32(theta) and some common subexpressions
// Set the matrix elements. There is still a little more opportuni_42 for optimization due to the many common subexpressions. We'll let the compiler handle that...
__m128 temp(_mm_set_ps1(aaxis.x));
row0 = _mm_mul_ps(temp, aaxis.vec);
temp = _mm_set_ps1(aaxis.y);
row1 = _mm_mul_ps(temp, aaxis.vec);
temp = _mm_set_ps1(aaxis.z);
row2 = _mm_mul_ps(temp, aaxis.vec);
_11 += c;
_12 += saxis.z;
_13 -= saxis.y;
_21 -= saxis.z;
_22 += c;
_23 += saxis.x;
_31 += saxis.y;
_32 -= saxis.x;
_33 += c;
row3 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Diagonal scale matrix; for a uniform scale by k pass vec3(k,k,k).
// _mm_setr_ps takes lanes in memory (row-element) order.
void mat4::setupScale(const vec3 &s) {
row0 = _mm_setr_ps(s.x, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(0.0f, s.y, 0.0f, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, s.z, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Convenience overload: forwards to the vec3 version.
void mat4::setupScale(float x, float y, float z){ setupScale(vec3(x, y, z)); }
void mat4::setupScaleAlongAxis(const vec3 &axis, float k) {// Setup the matrix to perform scale along an arbitrary axis. The axis is specified using a unit vector.
float a = k - 1.0f;
float ax = a * axis.x;
float ay = a * axis.y;
float az = a * axis.z;
// Fill in the matrix elements. We'll do the common subexpression optimization ourselves here, since diagonallyopposite matrix elements are equal
_11 = ax*axis.x + 1.0f;
_22 = ay*axis.y + 1.0f;
_32 = az*axis.z + 1.0f;
_12 = _21 = ax*axis.y;
_13 = _31 = ax*axis.z;
_23 = _32 = ay*axis.z;
row3 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// The type of shear is selected by which setupShear* function is called (the "_42" here was a bad find-and-replace of "ty"). Each function's effect is: the named axis contributes s and t to the other two coordinates.
// Shear driven by x: y += s*x, z += t*x.
// BUGFIX: rows were passed to _mm_set_ps in natural order, but _mm_set_ps
// takes lanes high-first -- row0 came out as (0,t,s,1) instead of (1,s,t,0).
// _mm_setr_ps uses memory order.
void mat4::setupShearX(float s, float t) {
row0 = _mm_setr_ps(1.0f, s, t, 0.0f);
row1 = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Shear driven by y: x += s*y, z += t*y.
// BUGFIX: same _mm_set_ps operand-order bug as setupShearX; setr fixes it.
void mat4::setupShearY(float s, float t) {
row0 = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(s, 1.0f, t, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Shear driven by z: x += s*z, y += t*z.
// BUGFIX: same _mm_set_ps operand-order bug as setupShearX; setr fixes it.
void mat4::setupShearZ(float s, float t) {
row0 = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
row2 = _mm_setr_ps(s, t, 1.0f, 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Left-handed look-at view matrix: the first three rows hold the camera basis
// vectors column-wise; the last row is the eye position projected (negated)
// onto that basis.
void mat4::setupLookAt(const vec3& eye, const vec3& at, const vec3& up){
vec3 zaxis = at - eye;
zaxis.normalize();
vec3 xaxis = Cross(up, zaxis);
xaxis.normalize();
vec3 yaxis = Cross(zaxis, xaxis);
row0 = _mm_setr_ps(xaxis.x, yaxis.x, zaxis.x, 0.0f);
row1 = _mm_setr_ps(xaxis.y, yaxis.y, zaxis.y, 0.0f);
row2 = _mm_setr_ps(xaxis.z, yaxis.z, zaxis.z, 0.0f);
row3 = _mm_setr_ps(-Dot(xaxis, eye), -Dot(yaxis, eye), -Dot(zaxis, eye), 1.0f);
}
// Left-handed perspective projection (D3D-style, depth mapped to [0,1]).
// fovY is the vertical field of view in radians; zn/zf are the near/far planes.
void mat4::setupProject(float fovY, float aspect, float zn, float zf) {
const float yScale = 1.0f / tan32(fovY * 0.5f);
const float invfmn = 1.0f / (zf - zn);
row0 = _mm_setr_ps(yScale / aspect, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(0.0f, yScale, 0.0f, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, zf * invfmn, 1.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, -zn * zf * invfmn, 0.0f);
}
// Left-handed orthographic projection (D3D-style, depth mapped to [0,1]);
// w/h are the view volume width/height, zn/zf the near/far planes.
void mat4::setupOrtho(float w, float h, float zn, float zf){
row0 = _mm_setr_ps(2.0f / w, 0.0f, 0.0f, 0.0f);
row1 = _mm_setr_ps(0.0f, 2.0f / h, 0.0f, 0.0f);
row2 = _mm_setr_ps(0.0f, 0.0f, 1.0f / (zf - zn), 0.0f);
row3 = _mm_setr_ps(0.0f, 0.0f, zn / (zn - zf), 1.0f);
}
// Setup the matrix to perform a reflection about a plane parallel to a cardinal plane.
void mat4::setupReflectX(float k) {
_11 = -1.0f; _12 = 0.0f; _13 = 0.0f;
_21 = 0.0f; _22 = 1.0f; _23 = 0.0f;
_31 = 0.0f; _32 = 0.0f; _33 = 1.0f;
_41 = 2.0f * k;
_42 = 0.0f;
_43 = 0.0f;
}
void mat4::setupReflectY(float k) {
_11 = 1.0f; _12 = 0.0f; _13 = 0.0f;
_21 = 0.0f; _22 = -1.0f; _23 = 0.0f;
_31 = 0.0f; _32 = 0.0f; _33 = 1.0f;
_41 = 0.0f;
_42 = 2.0f * k;
_43 = 0.0f;
}
void mat4::setupReflectZ(float k) {
_11 = 1.0f; _12 = 0.0f; _13 = 0.0f;
_21 = 0.0f; _22 = 1.0f; _23 = 0.0f;
_31 = 0.0f; _32 = 0.0f; _33 = -1.0f;
_41 = 0.0f;
_42 = 0.0f;
_43 = 2.0f * k;
}
void mat4::setupReflect(const vec3 &n) {// Setup the matrix to perform a reflection about an arbitrary plane through the origin. The unit vector n is perpendicular to the plane. The translation portion is reset.
float ax = -2.0f * n.x;
float ay = -2.0f * n.y;
float az = -2.0f * n.z;
// Fill in the matrix elements. We'll do the common subexpression optimization ourselves here, since diagonally opposite matrix elements are equal
_11 = 1.0f + ax*n.x;
_22 = 1.0f + ay*n.y;
_32 = 1.0f + az*n.z;
_12 = _21 = ax*n.y;
_13 = _31 = ax*n.z;
_23 = _32 = ay*n.z;
row3 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);
}
// Matrix product: result.row[i] = sum_k firstArg[i][k] * secondArg.row[k]
// (row-vector convention).  NOTE(review): the parameter names are swapped
// relative to the header declaration, but the math makes the first operand
// the left factor, matching callers' expectations.
// The pairwise add order ((k0+k1)+(k2+k3)) is kept for bit-identical results.
mat4 operator*(const mat4& b, const mat4 &left){
mat4 out;
for (int i = 0; i < 4; ++i) {
const __m128 r = b.rows[i];
const __m128 lo = _mm_add_ps(
_mm_mul_ps(left.rows[0], _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0))),
_mm_mul_ps(left.rows[1], _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 hi = _mm_add_ps(
_mm_mul_ps(left.rows[2], _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2))),
_mm_mul_ps(left.rows[3], _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3))));
out.rows[i] = _mm_add_ps(lo, hi);
}
return out;
}
// Copy all four SSE rows from rhs.  (Returns void, so assignment cannot be
// chained -- kept as declared.)
void mat4::operator=(const mat4& rhs){
for (int i = 0; i < 4; ++i)
rows[i] = rhs.rows[i];
}
// Transform point p by m (row-vector convention): implicit w = 1, so the
// translation row (_41.._43) is applied.  The matrix' fourth column is ignored.
vec3 operator*(const vec3 &p, const mat4 &m) {
return vec3( p.x*m._11 + p.y*m._21 + p.z*m._31 + m._41, p.x*m._12 + p.y*m._22 + p.z*m._32 + m._42, p.x*m._13 + p.y*m._23 + p.z*m._33 + m._43);
}
// NOTE(review): identical body to operator*(vec3, mat4) -- operand order does
// NOT transpose; m * p and p * m give the same result here.
vec3 operator*(const mat4 &m, const vec3 &p){
return vec3( p.x*m._11 + p.y*m._21 + p.z*m._31 + m._41, p.x*m._12 + p.y*m._22 + p.z*m._32 + m._42, p.x*m._13 + p.y*m._23 + p.z*m._33 + m._43);
}
// In-place point transform: p = p * m (implicit w = 1, translation applied).
vec3 &operator*=(vec3 &p, const mat4 &m) {
p = vec3( p.x*m._11 + p.y*m._21 + p.z*m._31 + m._41, p.x*m._12 + p.y*m._22 + p.z*m._32 + m._42, p.x*m._13 + p.y*m._23 + p.z*m._33 + m._43);
return p;
}
// In-place point transform; p.w is ignored on input and forced to 1.0 on
// output.  NOTE(review): returns a copy by value, unusual for a *= operator.
vec4 operator*=(vec4 &p, const mat4 &m) {// the w component is ignored for this.
p = vec4( p.x*m._11 + p.y*m._21 + p.z*m._31 + m._41, p.x*m._12 + p.y*m._22 + p.z*m._32 + m._42, p.x*m._13 + p.y*m._23 + p.z*m._33 + m._43, 1.0f);
return p;
}
// Point transform; p.w is ignored on input and the result's w is set to 1.0.
vec4 operator*(const vec4 &p, const mat4 &m) {// the w component is ignored for this.
return vec4( p.x*m._11 + p.y*m._21 + p.z*m._31 + m._41, p.x*m._12 + p.y*m._22 + p.z*m._32 + m._42, p.x*m._13 + p.y*m._23 + p.z*m._33 + m._43, 1.0f);
}
// Transform p by the upper-left 3x3 only (rotation/scale, no translation) --
// for direction vectors.  NOTE(review): for true normals under non-uniform
// scale the inverse-transpose would be required -- confirm callers' usage.
void TransformNormal(vec3& p, const mat4& m){
p = vec3( p.x*m._11 + p.y*m._21 + p.z*m._31, p.x*m._12 + p.y*m._22 + p.z*m._32, p.x*m._13 + p.y*m._23 + p.z*m._33);
};
// Matrix product (*this) * b under the row-vector convention:
// result.row[i] = sum_k this[i][k] * b.row[k].
// Each scalar of this row is broadcast with a shuffle, multiplied against the
// matching row of b, and summed pairwise ((k0+k1)+(k2+k3)) -- the same
// association as before, so results are bit-identical.
mat4 mat4::operator*(const mat4 &b) {
mat4 out;
for (int i = 0; i < 4; ++i) {
const __m128 r = rows[i];
const __m128 lo = _mm_add_ps(
_mm_mul_ps(b.rows[0], _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0))),
_mm_mul_ps(b.rows[1], _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1))));
const __m128 hi = _mm_add_ps(
_mm_mul_ps(b.rows[2], _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2))),
_mm_mul_ps(b.rows[3], _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3))));
out.rows[i] = _mm_add_ps(lo, hi);
}
return out;
}
// In-place product: delegates to operator* then assigns.
mat4& mat4::operator*=(const mat4 &b) {
*this = *this * b;
return *this;
}
// Component-wise matrix sum; returns a new matrix.
mat4 mat4::operator+(const mat4 &b) {
mat4 sum(*this);
for (int i = 0; i < 4; ++i)
sum.rows[i] = _mm_add_ps(sum.rows[i], b.rows[i]);
return sum;
}
// Component-wise in-place matrix sum.
mat4& mat4::operator+=(const mat4 &b) {
for (int i = 0; i < 4; ++i)
rows[i] = _mm_add_ps(rows[i], b.rows[i]);
return *this;
}
// Component-wise matrix difference; returns a new matrix.
mat4 mat4::operator-(const mat4 &b) {
mat4 diff(*this);
for (int i = 0; i < 4; ++i)
diff.rows[i] = _mm_sub_ps(diff.rows[i], b.rows[i]);
return diff;
}
// Component-wise in-place matrix difference.
mat4& mat4::operator-=(const mat4 &b) {
for (int i = 0; i < 4; ++i)
rows[i] = _mm_sub_ps(rows[i], b.rows[i]);
return *this;
}
// Uniform scalar multiply of every element; returns a new matrix.
mat4 mat4::operator*(const float &b) {
const __m128 scale = _mm_set_ps1(b);
mat4 out;
for (int i = 0; i < 4; ++i)
out.rows[i] = _mm_mul_ps(rows[i], scale);
return out;
}
// 1.0f in all four lanes; used by mat4::inverse() to form the reciprocal of
// the determinant via _mm_div_ps.
static const __m128 one = _mm_set_ps1(1.0f);
// 4-component dot product of v1 and v2, broadcast into all four lanes of the
// result (two shuffle+add horizontal reduction steps).
inline __m128 _mm_dot_ps(__m128 v1, __m128 v2)
{
__m128 prod = _mm_mul_ps(v1, v2);
__m128 shuf = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(prod, shuf);
shuf = _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(0, 1, 2, 3));
return _mm_add_ps(sums, shuf);
}
// In-place inverse via the classical adjoint divided by the determinant
// (GLM-style SSE implementation: six 2x2 sub-factor vectors, then the four
// cofactor columns with alternating signs, then scale by 1/det).
// WARNING(review): there is no singularity check -- if the determinant is
// zero, Rcp0 below divides by zero and the matrix fills with inf/NaN.
void mat4::inverse() {// Compute the inverse of a matrix. We use the classical adjoint divided by the determinant method.
// If we're singular, then the determinant is zero and there's no inverse Compute one over the determinant, so we divide once and can *multiply* per element
/* /// old implementation (scalar reference version, kept for comparison)
float D = _12 *_24 *_33* _41 - _12* _23* _34* _41 - _11* _24* _33* _42 + _11* _23 *_34 *_42 - _12 *_24* _31* _43 + _11* _24* _32* _43 + _12* _21 *_34* _43 - _11 *_22* _34* _43 + _14 *(_23 *_32 *_41 - _22* _33* _41 - _23* _31 *_42 + _21* _33* _42 + _22* _31* _43 - _21 *_32* _43) + (_12 *_23* _31 - _11* _23 *_32 - _12 *_21 *_33 + _11 *_22 *_33)* _44 + _13* (-_24 *_32* _41 + _22 *_34 *_41 + _24* _31* _42 - _21* _34* _42 - _22 *_31 *_44 + _21 *_32 *_44);
temp._11 = -_24 *_33* _42 + _23* _34 *_42 + _24* _32* _43 - _22* _34* _43 - _23* _32* _44 + _22* _33* _44;
temp._12 = _14 *_33* _42 - _13 *_34 *_42 - _14 *_32* _43 + _12* _34 *_43 + _13* _32* _44 - _12 *_33* _44;
temp._13 = -_14* _23* _42 + _13* _24 *_42 + _14 *_22* _43 - _12* _24* _43 - _13* _22* _44 + _12* _23 *_44;
temp._14 = _14 *_23* _32 - _13* _24* _32 - _14* _22 *_33 + _12* _24* _33 + _13* _22 *_34 - _12* _23* _34;
temp._21 = _24 *_33* _41 - _23* _34* _41 - _24 *_31* _43 + _21* _34* _43 + _23 *_31 *_44 - _21 *_33 *_44;
temp._22 = -_14 *_33 *_41 + _13 *_34* _41 + _14* _31 *_43 - _11* _34* _43 - _13* _31* _44 + _11* _33* _44;
temp._23 = _14* _23* _41 - _13 *_24* _41 - _14 *_21* _43 + _11* _24* _43 + _13* _21 *_44 - _11* _23* _44;
temp._24 = -_14 *_23 *_31 + _13* _24* _31 + _14* _21* _33 - _11* _24* _33 - _13* _21* _34 + _11* _23* _34;
temp._31 = -_24* _32* _41 + _22 *_34* _41 + _24* _31* _42 - _21 *_34* _42 - _22* _31* _44 + _21* _32 *_44;
temp._32 = _14* _32* _41 - _12 *_34 *_41 - _14* _31* _42 + _11* _34* _42 + _12 *_31 *_44 - _11 *_32 *_44;
temp._33 = -_14 *_22* _41 + _12* _24* _41 + _14* _21* _42 - _11* _24* _42 - _12 *_21* _44 + _11* _22* _44;
temp._34 = _14 *_22* _31 - _12* _24* _31 - _14* _21* _32 + _11 *_24* _32 + _12 *_21 *_34 - _11* _22 *_34;
temp._41 = _23* _32 *_41 - _22* _33 *_41 - _23* _31 *_42 + _21* _33* _42 + _22 *_31 *_43 - _21* _32* _43;
temp._42 = -_13* _32* _41 + _12* _33 *_41 + _13* _31* _42 - _11* _33* _42 - _12 *_31 *_43 + _11* _32* _43;
temp._43 = _13 *_22* _41 - _12* _23* _41 - _13 *_21* _42 + _11* _23 *_42 + _12* _21* _43 - _11* _22 *_43;
temp._44 = -_13* _22* _31 + _12* _23* _31 + _13* _21* _32 - _11 *_23 *_32 - _12* _21* _33 + _11* _22* _33;
__m128 temp1(_mm_set_ps1(1.0f / determinant()));
mat4 t;
t._11 = _23*_34*_42 - _24*_33*_42 + _24*_32*_43 - _22*_34*_43 - _23*_32*_44 + _22*_33*_44;
t._12 = _14*_33*_42 - _13*_34*_42 - _14*_32*_43 + _12*_34*_43 + _13*_32*_44 - _12*_33*_44;
t._13 = _13*_24*_42 - _14*_23*_42 + _14*_22*_43 - _12*_24*_43 - _13*_22*_44 + _12*_23*_44;
t._14 = _14*_23*_32 - _13*_24*_32 - _14*_22*_33 + _12*_24*_33 + _13*_22*_34 - _12*_23*_34;
t._21 = _24*_33*_41 - _23*_34*_41 - _24*_31*_43 + _21*_34*_43 + _23*_31*_44 - _21*_33*_44;
t._22 = _13*_34*_41 - _14*_33*_41 + _14*_31*_43 - _11*_34*_43 - _13*_31*_44 + _11*_33*_44;
t._23 = _14*_23*_41 - _13*_24*_41 - _14*_21*_43 + _11*_24*_43 + _13*_21*_44 - _11*_23*_44;
t._24 = _13*_24*_31 - _14*_23*_31 + _14*_21*_33 - _11*_24*_33 - _13*_21*_34 + _11*_23*_34;
t._31 = _22*_34*_41 - _24*_32*_41 + _24*_31*_42 - _21*_34*_42 - _22*_31*_44 + _21*_32*_44;
t._32 = _14*_32*_41 - _12*_34*_41 - _14*_31*_42 + _11*_34*_42 + _12*_31*_44 - _11*_32*_44;
t._33 = _12*_24*_41 - _14*_22*_41 + _14*_21*_42 - _11*_24*_42 - _12*_21*_44 + _11*_22*_44;
t._34 = _14*_22*_31 - _12*_24*_31 - _14*_21*_32 + _11*_24*_32 + _12*_21*_34 - _11*_22*_34;
t._41 = _23*_32*_41 - _22*_33*_41 - _23*_31*_42 + _21*_33*_42 + _22*_31*_43 - _21*_32*_43;
t._42 = _12*_33*_41 - _13*_32*_41 + _13*_31*_42 - _11*_33*_42 - _12*_31*_43 + _11*_32*_43;
t._43 = _13*_22*_41 - _12*_23*_41 - _13*_21*_42 + _11*_23*_42 + _12*_21*_43 - _11*_22*_43;
t._44 = _12*_23*_31 - _13*_22*_31 + _13*_21*_32 - _11*_23*_32 - _12*_21*_33 + _11*_22*_33;
row0 = _mm_mul_ps(t.row0, temp1);
row1 = _mm_mul_ps(t.row1, temp1);
row2 = _mm_mul_ps(t.row2, temp1);
row3 = _mm_mul_ps(t.row3, temp1);
*/
// Fac0..Fac5: vectors of 2x2 sub-determinants shared by the cofactors.
__m128 Fac0;
{
// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(3, 3, 3, 3));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(3, 3, 3, 3));
Fac0 =_mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
__m128 Fac1;
{
// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(3, 3, 3, 3));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(3, 3, 3, 3));
Fac1 =_mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
__m128 Fac2;
{
// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(2, 2, 2, 2));
Fac2 =_mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
__m128 Fac3;
{
// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(3, 3, 3, 3));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(3, 3, 3, 3));
Fac3 = _mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
__m128 Fac4;
{
// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(2, 2, 2, 2));
Fac4 = _mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
__m128 Fac5;
{
// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
__m128 Swp0a = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Swp0b = _mm_shuffle_ps(row3, row2, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp00 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
__m128 Swp03 = _mm_shuffle_ps(row2, row1, _MM_SHUFFLE(1, 1, 1, 1));
Fac5 = _mm_sub_ps(_mm_mul_ps(Swp00, Swp01), _mm_mul_ps(Swp02, Swp03));
}
// Alternating cofactor signs for the adjoint.
__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
// m[1][0]
// m[0][0]
// m[0][0]
// m[0][0]
__m128 Temp0 = _mm_shuffle_ps(row1, row0, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));
// m[1][1]
// m[0][1]
// m[0][1]
// m[0][1]
__m128 Temp1 = _mm_shuffle_ps(row1, row0, _MM_SHUFFLE(1, 1, 1, 1));
__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));
// m[1][2]
// m[0][2]
// m[0][2]
// m[0][2]
__m128 Temp2 = _mm_shuffle_ps(row1, row0, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));
// m[1][3]
// m[0][3]
// m[0][3]
// m[0][3]
__m128 Temp3 = _mm_shuffle_ps(row1, row0, _MM_SHUFFLE(3, 3, 3, 3));
__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));
// col0
// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
__m128 Sub00 = _mm_sub_ps(_mm_mul_ps(Vec1, Fac0), _mm_mul_ps(Vec2, Fac1));
__m128 Add00 = _mm_add_ps(Sub00, _mm_mul_ps(Vec3, Fac2));
__m128 Inv0 = _mm_mul_ps(SignB, Add00);
// col1
// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
__m128 Sub01 = _mm_sub_ps(_mm_mul_ps(Vec0, Fac0), _mm_mul_ps(Vec2, Fac3));
__m128 Add01 = _mm_add_ps(Sub01, _mm_mul_ps(Vec3, Fac4));
__m128 Inv1 = _mm_mul_ps(SignA, Add01);
// col2
// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
__m128 Sub02 = _mm_sub_ps(_mm_mul_ps(Vec0, Fac1), _mm_mul_ps(Vec1, Fac3));
__m128 Add02 = _mm_add_ps(Sub02, _mm_mul_ps(Vec3, Fac5));
__m128 Inv2 = _mm_mul_ps(SignB, Add02);
// col3
// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
__m128 Sub03 = _mm_sub_ps(_mm_mul_ps(Vec0, Fac2), _mm_mul_ps(Vec1, Fac4));
__m128 Add03 = _mm_add_ps(Sub03, _mm_mul_ps(Vec2, Fac5));
__m128 Inv3 = _mm_mul_ps(SignA, Add03);
// Gather the first column of the inverse: (Inv0[0], Inv1[0], Inv2[0], Inv3[0]).
__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));
// valType Determinant = m[0][0] * Inverse[0][0]
// + m[0][1] * Inverse[1][0]
// + m[0][2] * Inverse[2][0]
// + m[0][3] * Inverse[3][0];
__m128 Det0 = _mm_dot_ps(row0, Row2);
__m128 Rcp0 = _mm_div_ps(one, Det0);// divides by zero if the matrix is singular
//__m128 Rcp0 = _mm_rcp_ps(Det0);
// Inverse /= Determinant;
row0 = _mm_mul_ps(Inv0, Rcp0);
row1= _mm_mul_ps(Inv1, Rcp0);
row2 = _mm_mul_ps(Inv2, Rcp0);
row3 = _mm_mul_ps(Inv3, Rcp0);
}