Browse Source

[wip] Math: SIMD-enabled index array packing/unpacking functions.

TODO: the compiler already generates the same code as my hand-written AVX2
      version, so there's no real benefit yet. It could still act as a sandbox.
TODO: properly handle cases of element count not being a multiple of 16
TODO: implement other variants
TODO: implement a pack version

[ci skip]
simd
Vladimír Vondruš 7 years ago
parent
commit
07c36b4a64
  1. 71
      src/Magnum/Math/Packing.cpp
  2. 34
      src/Magnum/Math/Packing.h
  3. 50
      src/Magnum/Math/Test/PackingTest.cpp

71
src/Magnum/Math/Packing.cpp

@ -25,6 +25,10 @@
#include "Packing.h"
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
namespace Magnum { namespace Math {
namespace {
@ -101,4 +105,71 @@ UnsignedShort packHalf(const Float value) {
return h;
}
namespace Implementation {
/* Scalar variant: copies every element of `in` into the element of `out` at
   the same index, converting it from UnsignedByte to UnsignedShort on
   assignment. Exactly in.size() elements are written. */
void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const std::size_t count = in.size();
    std::size_t index = 0;
    while(index != count) {
        out[index] = in[index];
        ++index;
    }
}
/* SSE2 variant: widens 8-bit unsigned elements of `in` to 16-bit elements of
   `out`, 16 elements per iteration.

   in   Source elements. Read with unaligned loads.
   out  Destination, expected to hold at least in.size() elements. Written
        with unaligned stores.

   Elements past the last full block of 16 are converted by a scalar loop. */
void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* const in128 = reinterpret_cast<const __m128i*>(in.data());
    __m128i* const out128 = reinterpret_cast<__m128i*>(out.data());
    const __m128i zero = _mm_setzero_si128();

    const std::size_t blockCount = in.size()/16;
    for(std::size_t i = 0; i != blockCount; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        /* Interleaving with zero bytes zero-extends each byte to 16 bits.
           The low eight bytes form the first output vector, the high eight
           bytes the second one. */
        _mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, zero));
        _mm_storeu_si128(out128 + i*2 + 1, _mm_unpackhi_epi8(a, zero));
    }

    /* Remaining elements that don't fill a whole 16-element block */
    for(std::size_t i = blockCount*16; i != in.size(); ++i)
        out[i] = in[i];
}
/* SSE4.1 variant: widens 8-bit unsigned elements of `in` to 16-bit elements
   of `out`, 16 elements per iteration.

   in   Source elements. Read with unaligned loads.
   out  Destination, expected to hold at least in.size() elements. Written
        with unaligned stores.

   Elements past the last full block of 16 are converted by a scalar loop. */
void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* const in128 = reinterpret_cast<const __m128i*>(in.data());
    __m128i* const out128 = reinterpret_cast<__m128i*>(out.data());

    const std::size_t blockCount = in.size()/16;
    for(std::size_t i = 0; i != blockCount; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        /* The conversion only consumes the low eight bytes, so the high
           eight bytes are shifted down for the second output vector */
        _mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a));
        _mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)));
    }

    /* Remaining elements that don't fill a whole 16-element block */
    for(std::size_t i = blockCount*16; i != in.size(); ++i)
        out[i] = in[i];
}
/* AVX2 variant: widens 8-bit unsigned elements of `in` to 16-bit elements of
   `out`, 16 elements per iteration. Each iteration loads 128 bits and stores
   256 bits.

   in   Source elements.
   out  Destination, expected to hold at least in.size() elements.

   Loads and stores are unaligned, matching the SSE variants. An aligned
   256-bit store would need 32-byte alignment, which is stricter than the
   16-byte alignment the public API documents. Elements past the last full
   block of 16 are converted by a scalar loop. */
void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const __m128i* const in128 = reinterpret_cast<const __m128i*>(in.data());
    __m256i* const out256 = reinterpret_cast<__m256i*>(out.data());

    const std::size_t blockCount = in.size()/16;
    for(std::size_t i = 0; i != blockCount; ++i) {
        const __m128i a = _mm_loadu_si128(in128 + i);
        _mm256_storeu_si256(out256 + i, _mm256_cvtepu8_epi16(a));
    }

    /* Remaining elements that don't fill a whole 16-element block */
    for(std::size_t i = blockCount*16; i != in.size(); ++i)
        out[i] = in[i];
}
}
namespace {
/* Baseline version of the multiversioned dispatcher, used when none of the
   more specific target versions below applies. It forwards to the scalar
   implementation so it doesn't rely on any instruction set extension. The
   SSE2 kernel is already reached through the "sse2" version. */
__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    Implementation::unpackUnsignedByteToShort(Simd::None, in, out);
}
// TODO: why gcc complains about unused functions here?!
/* SSE2 version of the multiversioned dispatcher: hands both views over to
   the SSE2 implementation. */
__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const auto& simd = Simd::Sse2;
    Implementation::unpackUnsignedByteToShort(simd, in, out);
}
/* SSE4.1 version of the multiversioned dispatcher: hands both views over to
   the SSE4.1 implementation. */
__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    const auto& simd = Simd::Sse41;
    Implementation::unpackUnsignedByteToShort(simd, in, out);
}
/* AVX2 version of the multiversioned dispatcher. It forwards to the AVX2
   implementation, which was previously never reached from the public entry
   point. */
__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    /* The AVX2 kernel writes 256 bits at a time and may need 32-byte
       alignment for that, while callers are only asked for 16-byte
       alignment. Take the 256-bit path only when the output meets the
       stricter requirement. Otherwise use the SSE4.1 kernel. The check can
       go away once the kernel is known to accept 16-byte-aligned output. */
    if(reinterpret_cast<std::uintptr_t>(out.data())%32 == 0)
        Implementation::unpackUnsignedByteToShort(Simd::Avx2, in, out);
    else
        Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
}
}
/* Public entry point: checks the preconditions and then hands over to the
   dispatcher, which selects an implementation for the CPU the code runs on.

   in   Source elements; expected to be 16-byte aligned.
   out  Destination; expected to have the same size as `in` and to be 16-byte
        aligned as well.

   The element count currently has to be a multiple of 16. */
void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
    CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), );
    /* Both views have to be checked; the output pointer is just as subject
       to the alignment requirement as the input one */
    CORRADE_ASSERT(!(reinterpret_cast<std::uintptr_t>(in.data())%16) && !(reinterpret_cast<std::uintptr_t>(out.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", );
    /** @todo run only for a multiple of 16, do the rest scalar */
    CORRADE_INTERNAL_ASSERT(!(in.size()%16));
    unpackUnsignedByteToShortDispatch(in, out);
}
}}

34
src/Magnum/Math/Packing.h

@ -30,6 +30,7 @@
*/
#include "Magnum/Math/Functions.h"
#include "Magnum/Math/Simd.h"
namespace Magnum { namespace Math {
@ -209,6 +210,39 @@ template<std::size_t size> Vector<size, Float> unpackHalf(const Vector<size, Uns
return out;
}
namespace Implementation {
// TODO: expose these publicly? would make sense, otherwise the tags are useless
/* Variants of the 8-bit to 16-bit unsigned unpacking, one per instruction
   set. The first argument is a tag that selects the variant at compile time;
   `in` is the source view and `out` the destination view. */
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::NoneT, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
}
/**
@brief Unpack an array of 8-bit unsigned integers to 16-bit
@param in   Source elements
@param out  Destination elements

Each element of @p in is widened into the element of @p out at the same index.
The @p in and @p out are expected to have the same size and be aligned to 16
bytes. The element count is currently also expected to be a multiple of 16.
*/
// TODO: mention SIMD?
MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
/**
@brief Unpack an array of 8-bit unsigned integers to 32-bit
@param in   Source elements
@param out  Destination elements

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
/* NOTE(review): no definition of this function is visible next to this
   declaration -- confirm it is implemented before relying on the export. */
MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedInt> out);
/**
@brief Unpack an array of 16-bit unsigned integers to 32-bit
@param in   Source elements
@param out  Destination elements

The @p in and @p out are expected to have the same size and be aligned to 16
bytes.
*/
/* NOTE(review): no definition of this function is visible next to this
   declaration -- confirm it is implemented before relying on the export. */
MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView<const UnsignedShort> in, Corrade::Containers::ArrayView<UnsignedInt> out);
}}
#endif

50
src/Magnum/Math/Test/PackingTest.cpp

@ -24,6 +24,7 @@
*/
#include <limits>
#include <Corrade/Containers/Array.h>
#include <Corrade/TestSuite/Tester.h>
#include "Magnum/Math/Packing.h"
@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester {
/* Half (un)pack functions are tested and benchmarked in HalfTest.cpp,
because there's involved comparison and benchmarks to ground truth */
void unpackUnsignedByteToShortBenchmark();
template<class T> void unpackUnsignedByteToShortBenchmark();
};
typedef Math::Vector3<Float> Vector3;
@ -62,6 +66,14 @@ PackingTest::PackingTest() {
&PackingTest::reunpackUnsinged,
&PackingTest::reunpackSinged,
&PackingTest::unpackTypeDeduction});
addBenchmarks<PackingTest>({
&PackingTest::unpackUnsignedByteToShortBenchmark,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::NoneT>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse2T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse41T>,
&PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Avx2T>
}, 1000);
}
void PackingTest::bitMax() {
@ -279,6 +291,44 @@ void PackingTest::unpackTypeDeduction() {
CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f);
}
void PackingTest::unpackUnsignedByteToShortBenchmark() {
Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;
CORRADE_BENCHMARK(100)
unpackUnsignedByteToShort(in, out);
}
// TODO: uh provide this elsewhere
/* Maps a SIMD tag type to the test case name used by the templated benchmark.
   Only the specializations below are defined. */
template<class> struct SimdTraits;

template<> struct SimdTraits<Simd::NoneT> {
    static const char* name() {
        return "unpackUnsignedByteToShortBenchmark<Simd::NoneT>";
    }
};

template<> struct SimdTraits<Simd::Sse2T> {
    static const char* name() {
        return "unpackUnsignedByteToShortBenchmark<Simd::Sse2T>";
    }
};

template<> struct SimdTraits<Simd::Sse41T> {
    static const char* name() {
        return "unpackUnsignedByteToShortBenchmark<Simd::Sse41T>";
    }
};

template<> struct SimdTraits<Simd::Avx2T> {
    static const char* name() {
        return "unpackUnsignedByteToShortBenchmark<Simd::Avx2T>";
    }
};
template<class T> void PackingTest::unpackUnsignedByteToShortBenchmark() {
setTestCaseName(SimdTraits<T>::name());
Corrade::Containers::Array<UnsignedByte> in{20000};
Corrade::Containers::Array<UnsignedShort> out{20000};
UnsignedByte a = 0;
for(auto& i: in) i = a++;
CORRADE_BENCHMARK(100)
// TODO: uh the typename wat
Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out);
}
}}}}
CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)

Loading…
Cancel
Save