From de3f8858f0433bd2eab1bdcdc5a178dd3fd7ccbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Wed, 23 Oct 2019 12:32:12 +0200
Subject: [PATCH] Math: now that we have a standalone cofactor(), inline its
 internals.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently this gives a nearly threefold speedup compared to before.
Didn't expect that.

Starting Magnum::Math::Test::MatrixBenchmark with 16 test cases...
  INFO Benchmarking a debug build.
 BENCH [01]  95.33 ± 4.52   ns multiply3()@499x10000 (wall time)
 BENCH [02] 183.99 ± 9.29   ns multiply4()@499x10000 (wall time)
 BENCH [03] 110.17 ± 8.50   ns comatrix3()@49x10000 (wall time)
 BENCH [04] 161.54 ± 10.13  ns invert3()@49x10000 (wall time)
 BENCH [05] 471.44 ± 19.40  ns invert3GaussJordan()@49x10000 (wall time)
 BENCH [06] 320.65 ± 13.23  ns invert3Rigid()@49x10000 (wall time)
 BENCH [07] 206.27 ± 9.80   ns invert3Orthogonal()@49x10000 (wall time)
 BENCH [08] 321.25 ± 18.82  ns comatrix4()@49x10000 (wall time)
 BENCH [09] 445.50 ± 15.18  ns invert4()@49x10000 (wall time)
 BENCH [10] 828.55 ± 16.96  ns invert4GaussJordan()@49x10000 (wall time)
 BENCH [11] 533.23 ± 21.75  ns invert4Rigid()@49x10000 (wall time)
 BENCH [12] 345.56 ± 10.16  ns invert4Orthogonal()@49x10000 (wall time)
 BENCH [13]  63.72 ± 6.85   ns transformVector3()@999x10000 (wall time)
 BENCH [14]  62.28 ± 4.43   ns transformPoint3()@999x10000 (wall time)
 BENCH [15]  82.05 ± 7.96   ns transformVector4()@999x10000 (wall time)
 BENCH [16]  79.32 ± 2.41   ns transformPoint4()@999x10000 (wall time)
Finished Magnum::Math::Test::MatrixBenchmark with 0 errors out of 5500 checks.
---
 src/Magnum/Math/Matrix.h | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/src/Magnum/Math/Matrix.h b/src/Magnum/Math/Matrix.h
index 103a78264..2a247c60e 100644
--- a/src/Magnum/Math/Matrix.h
+++ b/src/Magnum/Math/Matrix.h
@@ -372,6 +372,10 @@ template<std::size_t size, class T> struct MatrixDeterminant {
 
         return out;
     }
+
+    T operator()(const Matrix<size + 1, T>& m, const std::size_t skipCol, const std::size_t skipRow) const {
+        return m.ij(skipCol, skipRow).determinant(); /* generic fallback: determinant of the ij() submatrix */
+    }
 };
 
 /* This is not *critically* needed here (the specializations for 2x2 and 1x1
@@ -386,6 +390,19 @@ template<class T> struct MatrixDeterminant<3, T> {
             m._data[0]._data[1]*(m._data[1]._data[0]*m._data[2]._data[2] - m._data[2]._data[0]*m._data[1]._data[2]) +
             m._data[0]._data[2]*(m._data[1]._data[0]*m._data[2]._data[1] - m._data[2]._data[0]*m._data[1]._data[1]);
     }
+
+    /* Determinant of the 3x3 minor of a 4x4 matrix, an inlined equivalent of
+       ij(skipCol, skipRow).determinant() used internally by cofactor() */
+    constexpr T operator()(const Matrix<4, T>& m, const std::size_t skipCol, const std::size_t skipRow) const {
+        #define _c(i) _data[(i) + ((i) >= skipCol)] /* i-th column that is kept */
+        #define _r(i) _data[(i) + ((i) >= skipRow)] /* i-th row that is kept */
+        return
+            m._c(0)._r(0)*(m._c(1)._r(1)*m._c(2)._r(2) - m._c(2)._r(1)*m._c(1)._r(2)) -
+            m._c(0)._r(1)*(m._c(1)._r(0)*m._c(2)._r(2) - m._c(2)._r(0)*m._c(1)._r(2)) +
+            m._c(0)._r(2)*(m._c(1)._r(0)*m._c(2)._r(1) - m._c(2)._r(0)*m._c(1)._r(1));
+        #undef _r
+        #undef _c
+    }
 };
 
 template<class T> struct MatrixDeterminant<2, T> {
@@ -394,6 +411,16 @@ template<class T> struct MatrixDeterminant<2, T> {
            on debug builds (saves a lot, yet doesn't obfuscate too much) */
         return m._data[0]._data[0]*m._data[1]._data[1] - m._data[1]._data[0]*m._data[0]._data[1];
     }
+
+    /* Determinant of the 2x2 minor of a 3x3 matrix, an inlined equivalent of
+       ij(skipCol, skipRow).determinant() used internally by cofactor() */
+    constexpr T operator()(const Matrix<3, T>& m, const std::size_t skipCol, const std::size_t skipRow) const {
+        #define _c(i) _data[(i) + ((i) >= skipCol)] /* i-th column that is kept */
+        #define _r(i) _data[(i) + ((i) >= skipRow)] /* i-th row that is kept */
+        return m._c(0)._r(0)*m._c(1)._r(1) - m._c(1)._r(0)*m._c(0)._r(1);
+        #undef _r
+        #undef _c
+    }
 };
 
 template<class T> struct MatrixDeterminant<1, T> {
@@ -402,6 +429,12 @@ template<class T> struct MatrixDeterminant<1, T> {
            on debug builds (saves a lot, yet doesn't obfuscate too much) */
         return m._data[0]._data[0];
     }
+
+    /* The single element of a 2x2 matrix that remains after dropping column
+       skipCol and row skipRow; used internally by cofactor() */
+    constexpr T operator()(const Matrix<2, T>& m, const std::size_t skipCol, const std::size_t skipRow) const {
+        return m._data[skipCol == 0 ? 1 : 0]._data[skipRow == 0 ? 1 : 0];
+    }
 };
 
 template<std::size_t size, class T> struct StrictWeakOrdering<Matrix<size, T>>: StrictWeakOrdering<RectangularMatrix<size, size, T>> {};
@@ -441,7 +474,7 @@ template<std::size_t size, class T> Matrix<size-1, T> Matrix<size, T>::ij(const
 }
 
 template<std::size_t size, class T> T Matrix<size, T>::cofactor(std::size_t col, std::size_t row) const {
-    return (((row+col) & 1) ? -1 : 1)*ij(col, row).determinant();
+    return (((row+col) & 1) ? -1 : 1)*Implementation::MatrixDeterminant<size - 1, T>()(*this, col, row); /* sign (-1)^(row + col) times the minor's determinant */
 }
 
 template<std::size_t size, class T> Matrix<size, T> Matrix<size, T>::comatrix() const {