Browse Source

Merge 07c36b4a64 into e6db895452

pull/306/merge
Vladimír Vondruš 7 years ago committed by GitHub
parent
commit
9f7b78a9ec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      src/Magnum/Math/CMakeLists.txt
  2. 71
      src/Magnum/Math/Packing.cpp
  3. 34
      src/Magnum/Math/Packing.h
  4. 148
      src/Magnum/Math/Simd.h
  5. 50
      src/Magnum/Math/Test/PackingTest.cpp

1
src/Magnum/Math/CMakeLists.txt

@ -48,6 +48,7 @@ set(MagnumMath_HEADERS
Packing.h Packing.h
Range.h Range.h
RectangularMatrix.h RectangularMatrix.h
Simd.h
StrictWeakOrdering.h StrictWeakOrdering.h
Swizzle.h Swizzle.h
Tags.h Tags.h

71
src/Magnum/Math/Packing.cpp

@ -25,6 +25,10 @@
#include "Packing.h" #include "Packing.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
namespace Magnum { namespace Math { namespace Magnum { namespace Math {
namespace { namespace {
@ -101,4 +105,71 @@ UnsignedShort packHalf(const Float value) {
return h; return h;
} }
namespace Implementation {

/* Scalar fallback without explicit SIMD: copies every 8-bit element of `in`
   into the 16-bit element of `out` at the same index. Iterates over
   in.size() elements, so `out` has to be at least that large. */
void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    for(std::size_t i = 0; i < in.size(); ++i) out[i] = in[i];
}

/* SSE2 variant. Processes the input in groups of 16 bytes (in.size()/16
   iterations); elements past the last complete group are not touched. Uses
   unaligned loads and stores. */
void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
    const __m128i zero = _mm_setzero_si128();
    for(std::size_t i = 0; i < in.size()/16; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        /* Interleaving with zero bytes zero-extends each byte to 16 bits.
           The low interleave covers input bytes 0-7, the high interleave
           covers bytes 8-15 -- both halves are needed to fill the 16 output
           elements. */
        _mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, zero));
        _mm_storeu_si128(out128 + i*2 + 1, _mm_unpackhi_epi8(a, zero));
    }
}

/* SSE4.1 variant. Same chunking as the SSE2 one (in.size()/16 iterations of
   16 bytes each), but uses the dedicated zero-extending conversion. */
void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
    for(std::size_t i = 0; i < in.size()/16; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        /* The conversion consumes only the low 8 bytes, so the upper half
           is shifted down by 8 bytes before the second conversion */
        _mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a));
        _mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)));
    }
}

/* AVX2 variant. Each iteration converts 16 input bytes into one 256-bit
   register of 16 shorts (in.size()/16 iterations). Unaligned load and store
   are used, consistently with the other variants: an aligned 256-bit store
   would require 32-byte alignment of the output, while the public entry
   point checks for 16-byte alignment only. */
void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
    __m256i* out256 = reinterpret_cast<__m256i*>(out.data());
    for(std::size_t i = 0; i < in.size()/16; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        _mm256_storeu_si256(out256 + i, _mm256_cvtepu8_epi16(a));
    }
}

}

namespace {

/* Function multiversioning: one overload per target, each forwarding to the
   implementation matching that target. The default version is the one used
   when none of the listed instruction sets is available, so it has to go to
   the scalar fallback. */
__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    Implementation::unpackUnsignedByteToShort(Simd::None, in, out);
}

// TODO: why gcc complains about unused functions here?!
__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
}

__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
}

__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    Implementation::unpackUnsignedByteToShort(Simd::Avx2, in, out);
}

}
/* Public entry point. Checks that both views have the same size and that
   both are 16-byte aligned, then forwards to the target-dispatched
   implementation. The size currently has to be a multiple of 16. */
void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), );
    /* Check the alignment of both the input and the output pointer */
    CORRADE_ASSERT(!(reinterpret_cast<std::uintptr_t>(in.data())%16) && !(reinterpret_cast<std::uintptr_t>(out.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", );
    /** @todo run only for a multiple of 16, do the rest scalar */
    CORRADE_INTERNAL_ASSERT(!(in.size()%16));
    unpackUnsignedByteToShortDispatch(in, out);
}
}} }}

34
src/Magnum/Math/Packing.h

@ -30,6 +30,7 @@
*/ */
#include "Magnum/Math/Functions.h" #include "Magnum/Math/Functions.h"
#include "Magnum/Math/Simd.h"
namespace Magnum { namespace Math { namespace Magnum { namespace Math {
@ -209,6 +210,39 @@ template<std::size_t size> Vector<size, Float> unpackHalf(const Vector<size, Uns
return out; return out;
} }
namespace Implementation {
/* Tag-dispatched variants of unpackUnsignedByteToShort(). The first
   (unnamed) parameter is a Simd tag type that selects which variant gets
   called; the remaining parameters are the input and output view. */
// TODO: expose these publicly? would make sense, otherwise the tags are useless
/* Variant selected by the Simd::NoneT tag */
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::NoneT, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
/* Variant selected by the Simd::Sse2T tag */
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
/* Variant selected by the Simd::Sse41T tag */
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
/* Variant selected by the Simd::Avx2T tag */
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
}
/**
@brief Unpack an array of 8-bit unsigned integers to 16-bit
@param in   Input view of 8-bit values
@param out  Output view receiving the 16-bit values

The @p in and @p out are expected to have the same size and be aligned to 16
bytes. The implementation currently also expects the size to be a multiple of
16.
*/
// TODO: mention SIMD?
MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
/**
@brief Unpack an array of 8-bit unsigned integers to 32-bit
@param in   Input view of 8-bit values
@param out  Output view receiving the 32-bit values

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
/* NOTE(review): only the declaration is visible here -- confirm that a
   matching definition exists before relying on this function */
MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedInt> out);
/**
@brief Unpack an array of 16-bit unsigned integers to 32-bit
@param in   Input view of 16-bit values
@param out  Output view receiving the 32-bit values

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
/* NOTE(review): only the declaration is visible here -- confirm that a
   matching definition exists before relying on this function */
MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView<const UnsignedShort> in, Corrade::Containers::ArrayView<UnsignedInt> out);
}} }}
#endif #endif

148
src/Magnum/Math/Simd.h

@ -0,0 +1,148 @@
#ifndef Magnum_Math_Simd_h
#define Magnum_Math_Simd_h
/*
This file is part of Magnum.
Copyright © 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019
Vladimír Vondruš <mosra@centrum.cz>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
/** @file
* @brief Namespace @ref Magnum::Math::Simd
*/
/** @namespace Magnum::Math::Simd
@brief SIMD dispatch tags
Tags for dispatching to particular SIMD-optimized versions of batch math
algorithms.
This library is built as part of Magnum by default. To use this library with
CMake, you need to find the `Magnum` package and link to the `Magnum::Magnum`
target:
@code{.cmake}
find_package(Magnum REQUIRED)
# ...
target_link_libraries(your-app Magnum::Magnum)
@endcode
See @ref building and @ref cmake for more information.
*/
namespace Magnum { namespace Math { namespace Simd {
/**
@brief No SIMD acceleration tag type

Used to distinguish algorithms that have no explicit SIMD optimizations, apart
from compiler magic.
@see @ref None
*/
/* Explicit constructor to avoid ambiguous calls when using {}. The nested
   Init type is the only thing the tag can be constructed from. */
struct NoneT {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit NoneT(Init) {}
#endif
};
/**
@brief SSE2 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
@see @ref Sse2
*/
/* Explicit constructor to avoid ambiguous calls when using {}. The nested
   Init type is the only thing the tag can be constructed from. */
struct Sse2T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Sse2T(Init) {}
#endif
};
/**
@brief SSE4.1 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
@see @ref Sse41
*/
/* Explicit constructor to avoid ambiguous calls when using {}. The nested
   Init type is the only thing the tag can be constructed from. */
struct Sse41T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Sse41T(Init) {}
#endif
};
/**
@brief AVX2 SIMD acceleration tag type

Used to distinguish algorithms that use at most the
[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
instruction set.
@see @ref Avx2
*/
/* Explicit constructor to avoid ambiguous calls when using {}. The nested
   Init type is the only thing the tag can be constructed from. */
struct Avx2T {
#ifndef DOXYGEN_GENERATING_OUTPUT
struct Init{};
constexpr explicit Avx2T(Init) {}
#endif
};
/**
@brief No SIMD acceleration tag

Use for selecting algorithms with no explicit SIMD optimizations.
*/
constexpr NoneT None{NoneT::Init{}};
/**
@brief SSE2 SIMD acceleration tag

Use for selecting algorithms that use at most the
[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set.
*/
constexpr Sse2T Sse2{Sse2T::Init{}};
/**
@brief SSE4.1 SIMD acceleration tag

Use for selecting algorithms that use at most the
[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set.
*/
constexpr Sse41T Sse41{Sse41T::Init{}};
/**
@brief AVX2 SIMD acceleration tag

Use for selecting algorithms that use at most the
[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2)
instruction set.
*/
constexpr Avx2T Avx2{Avx2T::Init{}};
}}}
#endif

50
src/Magnum/Math/Test/PackingTest.cpp

@ -24,6 +24,7 @@
*/ */
#include <limits> #include <limits>
#include <Corrade/Containers/Array.h>
#include <Corrade/TestSuite/Tester.h> #include <Corrade/TestSuite/Tester.h>
#include "Magnum/Math/Packing.h" #include "Magnum/Math/Packing.h"
@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester {
/* Half (un)pack functions are tested and benchmarked in HalfTest.cpp, /* Half (un)pack functions are tested and benchmarked in HalfTest.cpp,
because there's involved comparison and benchmarks to ground truth */ because there's involved comparison and benchmarks to ground truth */
void unpackUnsignedByteToShortBenchmark();
template<class T> void unpackUnsignedByteToShortBenchmark();
}; };
typedef Math::Vector3<Float> Vector3; typedef Math::Vector3<Float> Vector3;
@ -62,6 +66,14 @@ PackingTest::PackingTest() {
&PackingTest::reunpackUnsinged, &PackingTest::reunpackUnsinged,
&PackingTest::reunpackSinged, &PackingTest::reunpackSinged,
&PackingTest::unpackTypeDeduction}); &PackingTest::unpackTypeDeduction});
addBenchmarks<PackingTest>({
&PackingTest::unpackUnsignedByteToShortBenchmark,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::NoneT>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse2T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse41T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Avx2T>
}, 1000);
} }
void PackingTest::bitMax() { void PackingTest::bitMax() {
@ -279,6 +291,44 @@ void PackingTest::unpackTypeDeduction() {
CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f); CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f);
} }
void PackingTest::unpackUnsignedByteToShortBenchmark() {
Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;
CORRADE_BENCHMARK(100)
unpackUnsignedByteToShort(in, out);
}
// TODO: uh provide this elsewhere
/* Maps a Simd tag type to the test case name shown for the corresponding
   benchmark instance. Only the specializations below are defined. */
template<class> struct SimdTraits;

template<> struct SimdTraits<Simd::NoneT> {
    static auto name() -> const char* {
        return "unpackUnsignedByteToShortBenchmark<Simd::NoneT>";
    }
};

template<> struct SimdTraits<Simd::Sse2T> {
    static auto name() -> const char* {
        return "unpackUnsignedByteToShortBenchmark<Simd::Sse2T>";
    }
};

template<> struct SimdTraits<Simd::Sse41T> {
    static auto name() -> const char* {
        return "unpackUnsignedByteToShortBenchmark<Simd::Sse41T>";
    }
};

template<> struct SimdTraits<Simd::Avx2T> {
    static auto name() -> const char* {
        return "unpackUnsignedByteToShortBenchmark<Simd::Avx2T>";
    }
};
template<class T> void PackingTest::unpackUnsignedByteToShortBenchmark() {
setTestCaseName(SimdTraits<T>::name());
Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;
CORRADE_BENCHMARK(100)
// TODO: uh the typename wat
Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out);
}
}}}} }}}}
CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest) CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)

Loading…
Cancel
Save