From 573125d0a9f576031b1e5775b926353dbdb8a75e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Tue, 21 Apr 2020 22:20:25 +0200 Subject: [PATCH] Math: make cross() 10x faster in Debug. And the Vector3 version 5% slower in Release, on GCC at least. FFS, what was I thinking with the gather() things. Nice in user code, extremely bad in library code. --- doc/snippets/MagnumMath.cpp | 1 + doc/snippets/MagnumTrade.cpp | 3 +- src/Magnum/Math/Test/ColorTest.cpp | 1 + src/Magnum/Math/Test/Vector2Test.cpp | 1 + src/Magnum/Math/Test/Vector3Test.cpp | 1 + src/Magnum/Math/Test/Vector4Test.cpp | 1 + src/Magnum/Math/Test/VectorBenchmark.cpp | 79 ++++++++++++++++++++++++ src/Magnum/Math/Vector2.h | 5 +- src/Magnum/Math/Vector3.h | 25 ++++---- 9 files changed, 102 insertions(+), 15 deletions(-) diff --git a/doc/snippets/MagnumMath.cpp b/doc/snippets/MagnumMath.cpp index d2bb4bbf0..167b490f3 100644 --- a/doc/snippets/MagnumMath.cpp +++ b/doc/snippets/MagnumMath.cpp @@ -39,6 +39,7 @@ #include "Magnum/Math/Range.h" #include "Magnum/Math/Algorithms/GramSchmidt.h" #include "Magnum/Math/StrictWeakOrdering.h" +#include "Magnum/Math/Swizzle.h" using namespace Magnum; using namespace Magnum::Math::Literals; diff --git a/doc/snippets/MagnumTrade.cpp b/doc/snippets/MagnumTrade.cpp index 77da74394..a0460181c 100644 --- a/doc/snippets/MagnumTrade.cpp +++ b/doc/snippets/MagnumTrade.cpp @@ -32,8 +32,9 @@ #include "Magnum/ImageView.h" #include "Magnum/Mesh.h" #include "Magnum/PixelFormat.h" -#include "Magnum/MeshTools/Interleave.h" #include "Magnum/Animation/Player.h" +#include "Magnum/Math/Swizzle.h" +#include "Magnum/MeshTools/Interleave.h" #include "Magnum/MeshTools/Transform.h" #include "Magnum/Trade/AbstractImporter.h" #include "Magnum/Trade/AnimationData.h" diff --git a/src/Magnum/Math/Test/ColorTest.cpp b/src/Magnum/Math/Test/ColorTest.cpp index 5cef591e6..4c4b6c60b 100644 --- a/src/Magnum/Math/Test/ColorTest.cpp +++ b/src/Magnum/Math/Test/ColorTest.cpp @@ -35,6 
+35,7 @@ #include "Magnum/Math/Color.h" #include "Magnum/Math/Half.h" #include "Magnum/Math/StrictWeakOrdering.h" +#include "Magnum/Math/Swizzle.h" struct Vec3 { float x, y, z; diff --git a/src/Magnum/Math/Test/Vector2Test.cpp b/src/Magnum/Math/Test/Vector2Test.cpp index 991956768..dddb036d5 100644 --- a/src/Magnum/Math/Test/Vector2Test.cpp +++ b/src/Magnum/Math/Test/Vector2Test.cpp @@ -29,6 +29,7 @@ #include "Magnum/Math/Vector3.h" /* Vector3 used in Vector2Test::cross() */ #include "Magnum/Math/StrictWeakOrdering.h" +#include "Magnum/Math/Swizzle.h" struct Vec2 { float x, y; diff --git a/src/Magnum/Math/Test/Vector3Test.cpp b/src/Magnum/Math/Test/Vector3Test.cpp index bfeb9030e..2ba0c2935 100644 --- a/src/Magnum/Math/Test/Vector3Test.cpp +++ b/src/Magnum/Math/Test/Vector3Test.cpp @@ -29,6 +29,7 @@ #include "Magnum/Math/Vector3.h" #include "Magnum/Math/StrictWeakOrdering.h" +#include "Magnum/Math/Swizzle.h" struct Vec3 { float x, y, z; diff --git a/src/Magnum/Math/Test/Vector4Test.cpp b/src/Magnum/Math/Test/Vector4Test.cpp index db89add8a..b8f6ce739 100644 --- a/src/Magnum/Math/Test/Vector4Test.cpp +++ b/src/Magnum/Math/Test/Vector4Test.cpp @@ -29,6 +29,7 @@ #include "Magnum/Math/Vector4.h" #include "Magnum/Math/StrictWeakOrdering.h" +#include "Magnum/Math/Swizzle.h" struct Vec4 { float x, y, z, w; diff --git a/src/Magnum/Math/Test/VectorBenchmark.cpp b/src/Magnum/Math/Test/VectorBenchmark.cpp index 6a690d604..e7c0625fc 100644 --- a/src/Magnum/Math/Test/VectorBenchmark.cpp +++ b/src/Magnum/Math/Test/VectorBenchmark.cpp @@ -27,6 +27,10 @@ #include "Magnum/Math/Vector3.h" +#ifdef CORRADE_TARGET_SSE2 +#include <xmmintrin.h> +#endif + namespace Magnum { namespace Math { namespace Test { namespace { struct VectorBenchmark: Corrade::TestSuite::Tester { @@ -39,6 +43,10 @@ struct VectorBenchmark: Corrade::TestSuite::Tester { template<class T> void cross3Baseline(); void cross3(); + #ifdef CORRADE_TARGET_SSE2 + void cross3SseNaive(); + void cross3SseOneShuffleLess(); + #endif };
VectorBenchmark::VectorBenchmark() { @@ -52,6 +60,10 @@ VectorBenchmark::VectorBenchmark() { &VectorBenchmark::cross3Baseline, &VectorBenchmark::cross3Baseline, &VectorBenchmark::cross3, + #ifdef CORRADE_TARGET_SSE2 + &VectorBenchmark::cross3SseNaive, + &VectorBenchmark::cross3SseOneShuffleLess, + #endif }, 500); } @@ -149,6 +161,73 @@ void VectorBenchmark::cross3() { CORRADE_VERIFY(a != a); } +#ifdef CORRADE_TARGET_SSE2 +inline Vector3 crossSseNaive(const Vector3& a, const Vector3& b) { + union { + __m128 v; + Float s[4]; + }; + + const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]); + const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]); + + v = _mm_sub_ps( + _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1)), + _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 1, 0, 2))), + _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 1, 0, 2)), + _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1)))); + return {s[0], s[1], s[2]}; +} + +/* https://twitter.com/sjb3d/status/563640846671953920. Originally the + Math::cross() was doing this, implemented as + gather<'y', 'z', 'x'>(a*gather<'y', 'z', 'x'>(b) - + b*gather<'y', 'z', 'x'>(a)) + but while slightly faster in Release (on GCC at least) than the + straightforward version, it was insanely slow in Debug. 
*/ +inline Vector3 crossSseOneShuffleLess(const Vector3& a, const Vector3& b) { + union { + __m128 v; + Float s[4]; + }; + + const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]); + const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]); + const __m128 cc = _mm_sub_ps( + _mm_mul_ps(aa, _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1))), + _mm_mul_ps(bb, _mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1)))); + + v = _mm_shuffle_ps(cc, cc, _MM_SHUFFLE(3, 0, 2, 1)); + return {s[0], s[1], s[2]}; +} + +void VectorBenchmark::cross3SseNaive() { + Vector3 a{1.3f, -1.1f, 1.0f}; + Vector3 b{4.5f, 3.2f, 7.3f}; + CORRADE_COMPARE(Test::crossSseNaive(a, b), + (Vector3{-11.23f, -4.99f, 9.11f})); + + CORRADE_BENCHMARK(Repeats) { + a = Test::crossSseNaive(a, b); + } + + CORRADE_VERIFY(a != a); +} + +void VectorBenchmark::cross3SseOneShuffleLess() { + Vector3 a{1.3f, -1.1f, 1.0f}; + Vector3 b{4.5f, 3.2f, 7.3f}; + CORRADE_COMPARE(Test::crossSseOneShuffleLess(a, b), + (Vector3{-11.23f, -4.99f, 9.11f})); + + CORRADE_BENCHMARK(Repeats) { + a = Test::crossSseOneShuffleLess(a, b); + } + + CORRADE_VERIFY(a != a); +} +#endif + }}}} CORRADE_TEST_MAIN(Magnum::Math::Test::VectorBenchmark) diff --git a/src/Magnum/Math/Vector2.h b/src/Magnum/Math/Vector2.h index d5d7b47ef..8f13fa1e9 100644 --- a/src/Magnum/Math/Vector2.h +++ b/src/Magnum/Math/Vector2.h @@ -48,7 +48,7 @@ perpendicular. 
@f[ @ref dot(const Vector<size, T>&, const Vector<size, T>&) */ template<class T> inline T cross(const Vector2<T>& a, const Vector2<T>& b) { - return dot(a.perpendicular(), b); + return a._data[0]*b._data[1] - a._data[1]*b._data[0]; } /** @@ -182,6 +182,9 @@ template<class T> class Vector2: public Vector<2, T> { aspectRatio() const { return x()/y(); } MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(2, Vector2) + + private: + template<class U> friend U cross(const Vector2<U>&, const Vector2<U>&); }; #ifndef DOXYGEN_GENERATING_OUTPUT diff --git a/src/Magnum/Math/Vector3.h b/src/Magnum/Math/Vector3.h index ca5cfe119..29f090830 100644 --- a/src/Magnum/Math/Vector3.h +++ b/src/Magnum/Math/Vector3.h @@ -30,29 +30,25 @@ */ #include "Magnum/Math/Vector2.h" -#include "Magnum/Math/Swizzle.h" namespace Magnum { namespace Math { /** @brief Cross product -Result has length of `0` either when one of them is zero or they are parallel -or antiparallel and length of `1` when two *normalized* vectors are -perpendicular. Done using the following equation: @f[ - \boldsymbol a \times \boldsymbol b = \begin{pmatrix} c_y \\ c_z \\ c_x \end{pmatrix} ~~~~~ - \boldsymbol c = \boldsymbol a \begin{pmatrix} b_y \\ b_z \\ b_x \end{pmatrix} - - \boldsymbol b \begin{pmatrix} a_y \\ a_z \\ a_x \end{pmatrix} -@f] -Which is equivalent to the common one (source: -https://twitter.com/sjb3d/status/563640846671953920): @f[ - \boldsymbol a \times \boldsymbol b = \begin{pmatrix}a_yb_z - a_zb_y \\ a_zb_x - a_xb_z \\ a_xb_y - a_yb_x \end{pmatrix} +Result has length of @cpp 0 @ce either when one of them is zero or they are +parallel or antiparallel and length of @cpp 1 @ce when two *normalized* vectors +are perpendicular.
@f[ + \boldsymbol a \times \boldsymbol b = \begin{pmatrix}a_yb_z - a_zb_y \\ a_zb_x - a_xb_z \\ a_xb_y - a_yb_x \end{pmatrix} @f] @see @ref cross(const Vector2<T>&, const Vector2<T>&), @ref planeEquation() */ template<class T> inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) { - return gather<'y', 'z', 'x'>(a*gather<'y', 'z', 'x'>(b) - - b*gather<'y', 'z', 'x'>(a)); + return { + a._data[1]*b._data[2] - b._data[1]*a._data[2], + a._data[2]*b._data[0] - b._data[2]*a._data[0], + a._data[0]*b._data[1] - b._data[0]*a._data[1] + }; } /** @@ -232,6 +228,9 @@ template<class T> class Vector3: public Vector<3, T> { } /**< @overload */ MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(3, Vector3) + + private: + template<class U> friend Vector3<U> cross(const Vector3<U>&, const Vector3<U>&); }; #ifndef DOXYGEN_GENERATING_OUTPUT