Browse Source

Math: make cross() 10x faster in Debug.

And the Vector3 version 5% slower in Release, on GCC at least. FFS,
what was I thinking with the gather() things. Nice in user code,
extremely bad in library code.
catastrophic-cross
Vladimír Vondruš 6 years ago
parent
commit
573125d0a9
  1. 1
      doc/snippets/MagnumMath.cpp
  2. 3
      doc/snippets/MagnumTrade.cpp
  3. 1
      src/Magnum/Math/Test/ColorTest.cpp
  4. 1
      src/Magnum/Math/Test/Vector2Test.cpp
  5. 1
      src/Magnum/Math/Test/Vector3Test.cpp
  6. 1
      src/Magnum/Math/Test/Vector4Test.cpp
  7. 79
      src/Magnum/Math/Test/VectorBenchmark.cpp
  8. 5
      src/Magnum/Math/Vector2.h
  9. 25
      src/Magnum/Math/Vector3.h

1
doc/snippets/MagnumMath.cpp

@ -39,6 +39,7 @@
#include "Magnum/Math/Range.h"
#include "Magnum/Math/Algorithms/GramSchmidt.h"
#include "Magnum/Math/StrictWeakOrdering.h"
#include "Magnum/Math/Swizzle.h"
using namespace Magnum;
using namespace Magnum::Math::Literals;

3
doc/snippets/MagnumTrade.cpp

@ -32,8 +32,9 @@
#include "Magnum/ImageView.h"
#include "Magnum/Mesh.h"
#include "Magnum/PixelFormat.h"
#include "Magnum/MeshTools/Interleave.h"
#include "Magnum/Animation/Player.h"
#include "Magnum/Math/Swizzle.h"
#include "Magnum/MeshTools/Interleave.h"
#include "Magnum/MeshTools/Transform.h"
#include "Magnum/Trade/AbstractImporter.h"
#include "Magnum/Trade/AnimationData.h"

1
src/Magnum/Math/Test/ColorTest.cpp

@ -35,6 +35,7 @@
#include "Magnum/Math/Color.h"
#include "Magnum/Math/Half.h"
#include "Magnum/Math/StrictWeakOrdering.h"
#include "Magnum/Math/Swizzle.h"
struct Vec3 {
float x, y, z;

1
src/Magnum/Math/Test/Vector2Test.cpp

@ -29,6 +29,7 @@
#include "Magnum/Math/Vector3.h" /* Vector3 used in Vector2Test::cross() */
#include "Magnum/Math/StrictWeakOrdering.h"
#include "Magnum/Math/Swizzle.h"
struct Vec2 {
float x, y;

1
src/Magnum/Math/Test/Vector3Test.cpp

@ -29,6 +29,7 @@
#include "Magnum/Math/Vector3.h"
#include "Magnum/Math/StrictWeakOrdering.h"
#include "Magnum/Math/Swizzle.h"
struct Vec3 {
float x, y, z;

1
src/Magnum/Math/Test/Vector4Test.cpp

@ -29,6 +29,7 @@
#include "Magnum/Math/Vector4.h"
#include "Magnum/Math/StrictWeakOrdering.h"
#include "Magnum/Math/Swizzle.h"
struct Vec4 {
float x, y, z, w;

79
src/Magnum/Math/Test/VectorBenchmark.cpp

@ -27,6 +27,10 @@
#include "Magnum/Math/Vector3.h"
#ifdef CORRADE_TARGET_SSE2
#include <xmmintrin.h>
#endif
namespace Magnum { namespace Math { namespace Test { namespace {
struct VectorBenchmark: Corrade::TestSuite::Tester {
@ -39,6 +43,10 @@ struct VectorBenchmark: Corrade::TestSuite::Tester {
template<class T> void cross3Baseline();
void cross3();
#ifdef CORRADE_TARGET_SSE2
void cross3SseNaive();
void cross3SseOneShuffleLess();
#endif
};
VectorBenchmark::VectorBenchmark() {
@ -52,6 +60,10 @@ VectorBenchmark::VectorBenchmark() {
&VectorBenchmark::cross3Baseline<Float>,
&VectorBenchmark::cross3Baseline<Double>,
&VectorBenchmark::cross3,
#ifdef CORRADE_TARGET_SSE2
&VectorBenchmark::cross3SseNaive,
&VectorBenchmark::cross3SseOneShuffleLess,
#endif
}, 500);
}
@ -149,6 +161,73 @@ void VectorBenchmark::cross3() {
CORRADE_VERIFY(a != a);
}
#ifdef CORRADE_TARGET_SSE2
/* Cross product through SSE, written the textbook way: four shuffles, two
   multiplications and one subtraction. The inputs are loaded as (x, y, z, 0);
   the fourth lane is carried along and dropped when building the result. */
inline Vector3 crossSseNaive(const Vector3& a, const Vector3& b) {
    /* _mm_set_ps() takes its arguments from the highest lane to the lowest */
    const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]);
    const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]);

    /* _MM_SHUFFLE(3, 0, 2, 1) gives (y, z, x, w), _MM_SHUFFLE(3, 1, 0, 2)
       gives (z, x, y, w), so this evaluates a.yzx*b.zxy - a.zxy*b.yzx */
    const __m128 result = _mm_sub_ps(
        _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1)),
                   _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 1, 0, 2))),
        _mm_mul_ps(_mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 1, 0, 2)),
                   _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1))));

    /* Store into a plain array instead of reading the inactive member of a
       union, which is undefined behavior in standard C++. The unaligned
       store is used because the array has no 16-byte alignment guarantee. */
    Float s[4];
    _mm_storeu_ps(s, result);
    return {s[0], s[1], s[2]};
}
/* https://twitter.com/sjb3d/status/563640846671953920. Originally the
   Math::cross() was doing this, implemented as
    gather<'y', 'z', 'x'>(a*gather<'y', 'z', 'x'>(b) -
                          b*gather<'y', 'z', 'x'>(a))
   but while slightly faster in Release (on GCC at least) than the
   straightforward version, it was insanely slow in Debug.

   Compared to crossSseNaive() this needs three shuffles instead of four:
   the unshuffled inputs are multiplied with the (y, z, x) permutation of the
   other operand and the difference is permuted once at the end. */
inline Vector3 crossSseOneShuffleLess(const Vector3& a, const Vector3& b) {
    /* _mm_set_ps() takes its arguments from the highest lane to the lowest */
    const __m128 aa = _mm_set_ps(0.0f, a[2], a[1], a[0]);
    const __m128 bb = _mm_set_ps(0.0f, b[2], b[1], b[0]);

    /* c = a*b.yzx - b*a.yzx */
    const __m128 cc = _mm_sub_ps(
        _mm_mul_ps(aa, _mm_shuffle_ps(bb, bb, _MM_SHUFFLE(3, 0, 2, 1))),
        _mm_mul_ps(bb, _mm_shuffle_ps(aa, aa, _MM_SHUFFLE(3, 0, 2, 1))));

    /* The result is c.yzx */
    const __m128 result = _mm_shuffle_ps(cc, cc, _MM_SHUFFLE(3, 0, 2, 1));

    /* Store into a plain array instead of reading the inactive member of a
       union, which is undefined behavior in standard C++. The unaligned
       store is used because the array has no 16-byte alignment guarantee. */
    Float s[4];
    _mm_storeu_ps(s, result);
    return {s[0], s[1], s[2]};
}
/* Benchmark case for crossSseNaive(): verifies one known result, then times
   repeated application of the function. */
void VectorBenchmark::cross3SseNaive() {
Vector3 a{1.3f, -1.1f, 1.0f};
Vector3 b{4.5f, 3.2f, 7.3f};
/* Sanity check against a precomputed cross product before measuring */
CORRADE_COMPARE(Test::crossSseNaive(a, b),
(Vector3{-11.23f, -4.99f, 9.11f}));
/* Each iteration feeds the previous result back in as the left operand */
CORRADE_BENCHMARK(Repeats) {
a = Test::crossSseNaive(a, b);
}
/* Uses the accumulated value after the loop. NOTE(review): presumably this
   passes because the fed-back products overflow and end up as NaN, for
   which a != a holds -- confirm */
CORRADE_VERIFY(a != a);
}
/* Benchmark case for crossSseOneShuffleLess(): verifies one known result,
   then times repeated application of the function. */
void VectorBenchmark::cross3SseOneShuffleLess() {
Vector3 a{1.3f, -1.1f, 1.0f};
Vector3 b{4.5f, 3.2f, 7.3f};
/* Sanity check against a precomputed cross product before measuring */
CORRADE_COMPARE(Test::crossSseOneShuffleLess(a, b),
(Vector3{-11.23f, -4.99f, 9.11f}));
/* Each iteration feeds the previous result back in as the left operand */
CORRADE_BENCHMARK(Repeats) {
a = Test::crossSseOneShuffleLess(a, b);
}
/* Uses the accumulated value after the loop. NOTE(review): presumably this
   passes because the fed-back products overflow and end up as NaN, for
   which a != a holds -- confirm */
CORRADE_VERIFY(a != a);
}
#endif
}}}}
CORRADE_TEST_MAIN(Magnum::Math::Test::VectorBenchmark)

5
src/Magnum/Math/Vector2.h

@ -48,7 +48,7 @@ perpendicular. @f[
@ref dot(const Vector<size, T>&, const Vector<size, T>&)
*/
template<class T> inline T cross(const Vector2<T>& a, const Vector2<T>& b) {
    /* Evaluated directly on the stored components (hence the friend
       declaration in Vector2) so that no helper calls are involved, which
       is what matters in unoptimized builds. */
    return a._data[0]*b._data[1] - a._data[1]*b._data[0];
}
/**
@ -182,6 +182,9 @@ template<class T> class Vector2: public Vector<2, T> {
aspectRatio() const { return x()/y(); }
MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(2, Vector2)
private:
template<class U> friend U cross(const Vector2<U>&, const Vector2<U>&);
};
#ifndef DOXYGEN_GENERATING_OUTPUT

25
src/Magnum/Math/Vector3.h

@ -30,29 +30,25 @@
*/
#include "Magnum/Math/Vector2.h"
#include "Magnum/Math/Swizzle.h"
namespace Magnum { namespace Math {
/**
@brief Cross product

Result has length of @cpp 0 @ce either when one of them is zero or they are
parallel or antiparallel and length of @cpp 1 @ce when two *normalized* vectors
are perpendicular. @f[
    \boldsymbol a \times \boldsymbol b = \begin{pmatrix}a_yb_z - a_zb_y \\ a_zb_x - a_xb_z \\ a_xb_y - a_yb_x \end{pmatrix}
@f]
@see @ref cross(const Vector2<T>&, const Vector2<T>&), @ref planeEquation()
*/
template<class T> inline Vector3<T> cross(const Vector3<T>& a, const Vector3<T>& b) {
    /* Evaluated directly on the stored components (hence the friend
       declaration in Vector3) instead of going through gather(), so that
       no helper calls or temporaries are involved in unoptimized builds. */
    return {
        a._data[1]*b._data[2] - b._data[1]*a._data[2],
        a._data[2]*b._data[0] - b._data[2]*a._data[0],
        a._data[0]*b._data[1] - b._data[0]*a._data[1]
    };
}
/**
@ -232,6 +228,9 @@ template<class T> class Vector3: public Vector<3, T> {
} /**< @overload */
MAGNUM_VECTOR_SUBCLASS_IMPLEMENTATION(3, Vector3)
private:
template<class U> friend Vector3<U> cross(const Vector3<U>&, const Vector3<U>&);
};
#ifndef DOXYGEN_GENERATING_OUTPUT

Loading…
Cancel
Save