Math: make cross() 10x faster in Debug.

This also makes the Vector3 version 5% slower in Release, on GCC at least. The gather()-based implementation was a mistake: swizzles are nice in user code but extremely bad in library code, where they made cross() very slow in Debug builds.
6 years ago · 573125d0a9
9 changed files with 102 additions and 15 deletions
--- a/doc/snippets/MagnumMath.cpp
+++ b/doc/snippets/MagnumMath.cpp
@@ -39,6 +39,7 @@
 #include "Magnum/Math/Range.h"
 #include "Magnum/Math/Algorithms/GramSchmidt.h"
 #include "Magnum/Math/StrictWeakOrdering.h"
+#include "Magnum/Math/Swizzle.h"

 using namespace Magnum;
 using namespace Magnum::Math::Literals;
--- a/doc/snippets/MagnumTrade.cpp
+++ b/doc/snippets/MagnumTrade.cpp
@@ -32,8 +32,9 @@
 #include "Magnum/ImageView.h"
 #include "Magnum/Mesh.h"
 #include "Magnum/PixelFormat.h"
-#include "Magnum/MeshTools/Interleave.h"
 #include "Magnum/Animation/Player.h"
+#include "Magnum/Math/Swizzle.h"
+#include "Magnum/MeshTools/Interleave.h"
 #include "Magnum/MeshTools/Transform.h"
 #include "Magnum/Trade/AbstractImporter.h"
 #include "Magnum/Trade/AnimationData.h"
--- a/src/Magnum/Math/Test/ColorTest.cpp
+++ b/src/Magnum/Math/Test/ColorTest.cpp
@@ -35,6 +35,7 @@
 #include "Magnum/Math/Color.h"
 #include "Magnum/Math/Half.h"
 #include "Magnum/Math/StrictWeakOrdering.h"
+#include "Magnum/Math/Swizzle.h"

 struct Vec3 {
    float x, y, z;
--- a/src/Magnum/Math/Test/Vector2Test.cpp
+++ b/src/Magnum/Math/Test/Vector2Test.cpp
@@ -29,6 +29,7 @@

 #include "Magnum/Math/Vector3.h" /* Vector3 used in Vector2Test::cross() */
 #include "Magnum/Math/StrictWeakOrdering.h"
+#include "Magnum/Math/Swizzle.h"

 struct Vec2 {
    float x, y;
--- a/src/Magnum/Math/Test/Vector3Test.cpp
+++ b/src/Magnum/Math/Test/Vector3Test.cpp
@@ -29,6 +29,7 @@

 #include "Magnum/Math/Vector3.h"
 #include "Magnum/Math/StrictWeakOrdering.h"
+#include "Magnum/Math/Swizzle.h"

 struct Vec3 {
    float x, y, z;
--- a/src/Magnum/Math/Test/Vector4Test.cpp
+++ b/src/Magnum/Math/Test/Vector4Test.cpp
@@ -29,6 +29,7 @@

 #include "Magnum/Math/Vector4.h"
 #include "Magnum/Math/StrictWeakOrdering.h"
+#include "Magnum/Math/Swizzle.h"

 struct Vec4 {
    float x, y, z, w;
--- a/src/Magnum/Math/Test/VectorBenchmark.cpp
+++ b/src/Magnum/Math/Test/VectorBenchmark.cpp
@@ -27,6 +27,10 @@

 #include "Magnum/Math/Vector3.h"

+#ifdef CORRADE_TARGET_SSE2
+#include <xmmintrin.h>
+#endif
+
 namespace Magnum { namespace Math { namespace Test { namespace {

 struct VectorBenchmark: Corrade::TestSuite::Tester {
@@ -39,6 +43,10 @@ struct VectorBenchmark: Corrade::TestSuite::Tester {

    template<class T> void cross3Baseline();
    void cross3();
+    #ifdef CORRADE_TARGET_SSE2
+    void cross3SseNaive();
+    void cross3SseOneShuffleLess();
+    #endif
 };

 VectorBenchmark::VectorBenchmark() {
@@ -52,6 +60,10 @@ VectorBenchmark::VectorBenchmark() {
        &VectorBenchmark::cross3Baseline<Float>,
        &VectorBenchmark::cross3Baseline<Double>,
        &VectorBenchmark::cross3,
+        #ifdef CORRADE_TARGET_SSE2
+        &VectorBenchmark::cross3SseNaive,
+        &VectorBenchmark::cross3SseOneShuffleLess,
+        #endif
    }, 500);
 }

@@ -149,6 +161,73 @@ void VectorBenchmark::cross3() {
    CORRADE_VERIFY(a != a);
 }

+#ifdef CORRADE_TARGET_SSE2
+inline Vector3 crossSseNaive(const Vector3& a, const Vector3& b) {
+    union { /* NOTE(review): punning v into s[] isn't guaranteed by ISO C++ */
+        __m128 v; /* the SIMD result is written here ... */
+        Float s[4]; /* ... and read back per lane through this member */
+    };
+
+    const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]); /* lanes 0-3: a[0], a[1], a[2], 0 */
+    const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]); /* lanes 0-3: b[0], b[1], b[2], 0 */
+
+    v = _mm_sub_ps( /* (a_y*b_z - a_z*b_y, a_z*b_x - a_x*b_z, a_x*b_y - a_y*b_x), four shuffles */
+        _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1)), /* a, lanes 1, 2, 0, 3 */
+                   _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 1, 0, 2))), /* b, lanes 2, 0, 1, 3 */
+        _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 1, 0, 2)), /* a, lanes 2, 0, 1, 3 */
+                   _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1)))); /* b, lanes 1, 2, 0, 3 */
+    return {s[0], s[1], s[2]}; /* lane 3 is discarded */
+}
+
+/* Computes yzx(a*yzx(b) - b*yzx(a)), three shuffles instead of four, see
+   https://twitter.com/sjb3d/status/563640846671953920. Math::cross() used to
+   do this via gather<'y', 'z', 'x'>(a*gather<'y', 'z', 'x'>(b) -
+                                     b*gather<'y', 'z', 'x'>(a))
+   which, while slightly faster in Release (on GCC at least) than the
+   straightforward version, was insanely slow in Debug. */
+inline Vector3 crossSseOneShuffleLess(const Vector3& a, const Vector3& b) {
+    union { /* NOTE(review): punning v into s[] isn't guaranteed by ISO C++ */
+        __m128 v; /* the SIMD result is written here ... */
+        Float s[4]; /* ... and read back per lane through this member */
+    };
+
+    const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]); /* lanes 0-3: a[0], a[1], a[2], 0 */
+    const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]); /* lanes 0-3: b[0], b[1], b[2], 0 */
+    const __m128 cc = _mm_sub_ps( /* a*yzx(b) - b*yzx(a), still rotated by one lane */
+        _mm_mul_ps(aa, _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1))), /* b, lanes 1, 2, 0, 3 */
+        _mm_mul_ps(bb, _mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1)))); /* a, lanes 1, 2, 0, 3 */
+
+    v = _mm_shuffle_ps(cc, cc, _MM_SHUFFLE(3, 0, 2, 1)); /* final rotation puts components in x, y, z order */
+    return {s[0], s[1], s[2]}; /* lane 3 is discarded */
+}
+
+void VectorBenchmark::cross3SseNaive() {
+    Vector3 a{1.3f, -1.1f, 1.0f};
+    Vector3 b{4.5f, 3.2f, 7.3f};
+    CORRADE_COMPARE(Test::crossSseNaive(a, b), /* correctness check before timing */
+        (Vector3{-11.23f, -4.99f, 9.11f}));
+
+    CORRADE_BENCHMARK(Repeats) {
+        a = Test::crossSseNaive(a, b); /* each result is fed back in as the next input */
+    }
+
+    CORRADE_VERIFY(a != a); /* presumably a has degenerated to NaN by now, and this keeps the result used; TODO confirm */
+}
+
+void VectorBenchmark::cross3SseOneShuffleLess() {
+    Vector3 a{1.3f, -1.1f, 1.0f};
+    Vector3 b{4.5f, 3.2f, 7.3f};
+    CORRADE_COMPARE(Test::crossSseOneShuffleLess(a, b), /* correctness check before timing */
+        (Vector3{-11.23f, -4.99f, 9.11f}));
+
+    CORRADE_BENCHMARK(Repeats) {
+        a = Test::crossSseOneShuffleLess(a, b); /* each result is fed back in as the next input */
+    }
+
+    CORRADE_VERIFY(a != a); /* presumably a has degenerated to NaN by now, and this keeps the result used; TODO confirm */
+}
+#endif
+
 }}}}

 CORRADE_TEST_MAIN(Magnum::Math::Test::VectorBenchmark)
--- a/src/Magnum/Math/Vector2.h
+++ b/src/Magnum/Math/Vector2.h
@@ -48,7 +48,7 @@ perpendicular. @f[
    @ref dot(const Vector<size, T>&, const Vector<size, T>&)
 */
 template<class T> inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
-    return dot(a.perpendicular(), b);
+    return a._data[0]*b._data[1] - a._data[1]*b._data[0]; /* direct element access, replaces the dot(a.perpendicular(), b) call */
 }

 /**
@@ -182,6 +182,9 @@ template<class T> class Vector2: public Vector<2, T> {
        aspectRatio() const { return x()/y(); }

        MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(2, Vector2)
+
+    private:
+        template<class U> friend U cross(const Vector2<U>&, const Vector2<U>&);
 };

 #ifndef DOXYGEN_GENERATING_OUTPUT
--- a/src/Magnum/Math/Vector3.h
+++ b/src/Magnum/Math/Vector3.h
@@ -30,29 +30,25 @@
 */

 #include "Magnum/Math/Vector2.h"
-#include "Magnum/Math/Swizzle.h"

 namespace Magnum { namespace Math {

 /**
 @brief Cross product

-Result has length of `0` either when one of them is zero or they are parallel
-or antiparallel and length of `1` when two *normalized* vectors are
-perpendicular. Done using the following equation: @f[
-     \boldsymbol a \times \boldsymbol b = \begin{pmatrix} c_y \\ c_z \\ c_x \end{pmatrix} ~~~~~
-     \boldsymbol c = \boldsymbol a \begin{pmatrix} b_y \\ b_z \\ b_x \end{pmatrix} -
-                     \boldsymbol b \begin{pmatrix} a_y \\ a_z \\ a_x \end{pmatrix}
-@f]
-Which is equivalent to the common one (source:
-https://twitter.com/sjb3d/status/563640846671953920): @f[
-     \boldsymbol a \times \boldsymbol b = \begin{pmatrix}a_yb_z - a_zb_y \\ a_zb_x - a_xb_z \\ a_xb_y - a_yb_x \end{pmatrix}
+Result has length of @cpp 0 @ce either when one of them is zero or they are
+parallel or antiparallel and length of @cpp 1 @ce when two *normalized* vectors
+are perpendicular. @f[
+    \boldsymbol a \times \boldsymbol b = \begin{pmatrix}a_yb_z - a_zb_y \\ a_zb_x - a_xb_z \\ a_xb_y - a_yb_x \end{pmatrix}
 @f]
 @see @ref cross(const Vector2<T>&, const Vector2<T>&), @ref planeEquation()
 */
 template<class T> inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
-    return gather<'y', 'z', 'x'>(a*gather<'y', 'z', 'x'>(b) -
-                                 b*gather<'y', 'z', 'x'>(a));
+    return {
+        a._data[1]*b._data[2] - b._data[1]*a._data[2], /* a_y*b_z - a_z*b_y */
+        a._data[2]*b._data[0] - b._data[2]*a._data[0], /* a_z*b_x - a_x*b_z */
+        a._data[0]*b._data[1] - b._data[0]*a._data[1] /* a_x*b_y - a_y*b_x */
+    };
 }

 /**
@@ -232,6 +228,9 @@ template<class T> class Vector3: public Vector<3, T> {
        } /**< @overload */

        MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(3, Vector3)
+
+    private:
+        template<class U> friend Vector3<U> cross(const Vector3<U>&, const Vector3<U>&);
 };

 #ifndef DOXYGEN_GENERATING_OUTPUT