diff --git a/src/Magnum/Math/Packing.cpp b/src/Magnum/Math/Packing.cpp
index 4d61646d1..a4e54840e 100644
--- a/src/Magnum/Math/Packing.cpp
+++ b/src/Magnum/Math/Packing.cpp
@@ -25,6 +25,10 @@
 
 #include "Packing.h"
 
+#include
+#include
+#include
+
 namespace Magnum { namespace Math {
 
 namespace {
@@ -101,4 +105,85 @@ UnsignedShort packHalf(const Float value) {
     return h;
 }
 
+namespace Implementation {
+
+/* Scalar variant. Assigns in[i] to out[i] for every index of @p in, the
+   widening is done by the implicit integer conversion. */
+void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    for(std::size_t i = 0; i < in.size(); ++i) out[i] = in[i];
+}
+
+/* SSE2 variant. Converts in.size()/16 whole blocks of 16 bytes, a remainder
+   of in.size()%16 elements is not touched. Zero-extension is done by
+   interleaving each block with a zero register: the low eight bytes form the
+   first output vector, the high eight bytes the second one. */
+void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
+    const __m128i zero = _mm_setzero_si128();
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_loadu_si128(in128 + i);
+        _mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, zero));
+        _mm_storeu_si128(out128 + i*2 + 1, _mm_unpackhi_epi8(a, zero));
+    }
+}
+
+/* SSE4.1 variant. Same block handling as the SSE2 one, except that the two
+   halves of each 16-byte block are zero-extended with _mm_cvtepu8_epi16(),
+   the upper half being shifted down by eight bytes first. */
+void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m128i* out128 = reinterpret_cast<__m128i*>(out.data());
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_loadu_si128(in128 + i);
+        _mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a));
+        _mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)));
+    }
+}
+
+/* AVX2 variant. Widens one 16-byte block into one 32-byte vector per
+   iteration. Unaligned load and store are used on purpose: the public entry
+   point only checks for 16-byte alignment, while _mm256_store_si256() needs
+   its destination aligned to 32 bytes. */
+void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    const __m128i* in128 = reinterpret_cast<const __m128i*>(in.data());
+    __m256i* out256 = reinterpret_cast<__m256i*>(out.data());
+    for(std::size_t i = 0; i < in.size()/16; ++i) {
+        __m128i a = _mm_loadu_si128(in128 + i);
+        _mm256_storeu_si256(out256 + i, _mm256_cvtepu8_epi16(a));
+    }
+}
+
+}
+
+namespace {
+
+__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::None, in, out); /* baseline target gets the scalar variant; NOTE(review): assumes a Simd::None tag value exists next to Simd::Sse2 -- confirm in Simd.h */
+}
+
+// TODO: why gcc complains about unused functions here?!
+__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out);
+}
+
+__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out);
+}
+
+__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    Implementation::unpackUnsignedByteToShort(Simd::Avx2, in, out); /* was forwarding to the SSE4.1 variant; NOTE(review): assumes a Simd::Avx2 tag value exists -- confirm in Simd.h */
+}
+
+}
+
+void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView<const UnsignedByte> in, const Corrade::Containers::ArrayView<UnsignedShort> out) {
+    CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), );
+    CORRADE_ASSERT(!(reinterpret_cast<std::uintptr_t>(in.data())%16) && !(reinterpret_cast<std::uintptr_t>(out.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", );
+
+    /** @todo run only for a multiple of 16, do the rest scalar */
+    CORRADE_INTERNAL_ASSERT(!(in.size()%16));
+    unpackUnsignedByteToShortDispatch(in, out);
+}
+
 }}
diff --git a/src/Magnum/Math/Packing.h b/src/Magnum/Math/Packing.h
index c12f77f70..12c6edf9d 100644
--- a/src/Magnum/Math/Packing.h
+++ b/src/Magnum/Math/Packing.h
@@ -30,6 +30,7 @@
  */
 
 #include "Magnum/Math/Functions.h"
+#include "Magnum/Math/Simd.h"
 
 namespace Magnum { namespace Math {
 
@@ -209,6 +210,39 @@ template<std::size_t size> Vector<size, Float> unpackHalf(const Vector<size, UnsignedShort>& value) {
+namespace Implementation {
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::NoneT, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+    MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+}
+
+/**
+@brief Unpack an array of 8-bit unsigned integers to 16-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes. The size is currently also expected to be a multiple of 16.
+*/
+// TODO: mention SIMD?
+MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedShort> out);
+
+/**
+@brief Unpack an array of 8-bit unsigned integers to 32-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes.
+*/
+MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView<const UnsignedByte> in, Corrade::Containers::ArrayView<UnsignedInt> out);
+
+/**
+@brief Unpack an array of 16-bit unsigned integers to 32-bit
+
+The @p in and @p out are expected to have the same size and be aligned to 16
+bytes.
+*/
+MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView<const UnsignedShort> in, Corrade::Containers::ArrayView<UnsignedInt> out);
+
 }}
 
 #endif
diff --git a/src/Magnum/Math/Test/PackingTest.cpp b/src/Magnum/Math/Test/PackingTest.cpp
index 759e7328e..b335efdfb 100644
--- a/src/Magnum/Math/Test/PackingTest.cpp
+++ b/src/Magnum/Math/Test/PackingTest.cpp
@@ -24,6 +24,7 @@
  */
 
 #include
+#include
 #include
 
 #include "Magnum/Math/Packing.h"
@@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester {
 
     /* Half (un)pack functions are tested and benchmarked in HalfTest.cpp,
        because there's involved comparison and benchmarks to ground truth */
+
+    void unpackUnsignedByteToShortBenchmark();
+    template<class T> void unpackUnsignedByteToShortBenchmark();
 };
 
 typedef Math::Vector3<Float> Vector3;
@@ -62,6 +66,14 @@ PackingTest::PackingTest() {
               &PackingTest::reunpackUnsinged,
               &PackingTest::reunpackSinged,
               &PackingTest::unpackTypeDeduction});
+
+    addBenchmarks({
+        &PackingTest::unpackUnsignedByteToShortBenchmark,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::NoneT>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse2T>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Sse41T>,
+        &PackingTest::unpackUnsignedByteToShortBenchmark<Simd::Avx2T>
+    }, 1000);
 }
 
 void PackingTest::bitMax() {
@@ -279,6 +291,46 @@ void PackingTest::unpackTypeDeduction() {
     CORRADE_COMPARE((Math::unpack<Float, Byte>('\x7F')), 1.0f);
 }
 
+/* Benchmarks the public, runtime-dispatched entry point on 20000 elements */
+void PackingTest::unpackUnsignedByteToShortBenchmark() {
+    Corrade::Containers::Array<UnsignedByte> in{20000};
+    Corrade::Containers::Array<UnsignedShort> out{20000};
+    UnsignedByte a = 0;
+    for(auto& i: in) i = a++;
+
+    CORRADE_BENCHMARK(100)
+        unpackUnsignedByteToShort(in, out);
+}
+
+// TODO: uh provide this elsewhere
+template<class> struct SimdTraits;
+template<> struct SimdTraits<Simd::NoneT> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::NoneT>"; }
+};
+template<> struct SimdTraits<Simd::Sse2T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse2T>"; }
+};
+template<> struct SimdTraits<Simd::Sse41T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Sse41T>"; }
+};
+template<> struct SimdTraits<Simd::Avx2T> {
+    static const char* name() { return "unpackUnsignedByteToShortBenchmark<Simd::Avx2T>"; }
+};
+
+/* Benchmarks one concrete implementation, selected by the tag type T */
+template<class T> void PackingTest::unpackUnsignedByteToShortBenchmark() {
+    setTestCaseName(SimdTraits<T>::name());
+
+    Corrade::Containers::Array<UnsignedByte> in{20000};
+    Corrade::Containers::Array<UnsignedShort> out{20000};
+    UnsignedByte a = 0;
+    for(auto& i: in) i = a++;
+
+    CORRADE_BENCHMARK(100)
+        // TODO: uh the typename wat
+        Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out);
+}
+
 }}}}
 
 CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)