More SIMD features - Extract a standalone simd::sum() function from simd::dot(). - Add column-major (transposed) loads. - Add a simd::cast function for type conversion. - Add rive::UnpackColor4f(), which uses simd to convert a ColorInt to 4 normalized floats. - Bonus: Add an assert to RIVE_UNREACHABLE and add a RIVE_DEBUG_CODE() macro. Diffs= c8b5fdadd More SIMD features
diff --git a/.rive_head b/.rive_head index ecf6ff9..4758ad4 100644 --- a/.rive_head +++ b/.rive_head
@@ -1 +1 @@ -87f079a103eb417091b94ea5ed7eecd90cbc6149 +c8b5fdadd370d61267c0c6352b609f3608f3efd5
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp index 9cb90aa..f042877 100644 --- a/include/rive/math/simd.hpp +++ b/include/rive/math/simd.hpp
@@ -17,8 +17,8 @@ #include "rive/rive_types.hpp" #include <cassert> #include <limits> +#include <math.h> #include <stdint.h> -#include <string.h> #include <tuple> #include <type_traits> @@ -176,6 +176,27 @@ #endif } +template <typename T, int N> RIVE_ALWAYS_INLINE T sum(gvec<T, N> x) +{ + T s = x[0]; + for (int i = 1; i < N; ++i) + s += x[i]; + return s; +} + +// We can use __builtin_reduce_add for integer types. +#if __has_builtin(__builtin_reduce_add) +template <int N> RIVE_ALWAYS_INLINE int32_t sum(gvec<int32_t, N> x) +{ + return __builtin_reduce_add(x); +} + +template <int N> RIVE_ALWAYS_INLINE uint32_t sum(gvec<uint32_t, N> x) +{ + return __builtin_reduce_add(x); +} +#endif + ////// Floating Point Functions ////// template <int N> RIVE_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x) @@ -289,6 +310,16 @@ return x * (numer / denom) + pi_over_2; } +////// Type conversion ////// + +template <typename U, typename T, int N> RIVE_ALWAYS_INLINE gvec<U, N> cast(gvec<T, N> x) +{ + gvec<U, N> y{}; + for (int i = 0; i < N; ++i) + y[i] = static_cast<U>(x[i]); + return y; +} + ////// Loading and storing ////// template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> load(const void* ptr) @@ -309,6 +340,49 @@ RIVE_INLINE_MEMCPY(dst, &vec, sizeof(T) * N); } +////// Column-major (transposed) loads ////// + +#ifdef __ARM_NEON__ +RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>> +load4x4f(const float* matrix) +{ + float32x4x4_t m = vld4q_f32(matrix); + gvec<float, 4> c0, c1, c2, c3; + RIVE_INLINE_MEMCPY(&c0, &m.val[0], sizeof(c0)); + RIVE_INLINE_MEMCPY(&c1, &m.val[1], sizeof(c1)); + RIVE_INLINE_MEMCPY(&c2, &m.val[2], sizeof(c2)); + RIVE_INLINE_MEMCPY(&c3, &m.val[3], sizeof(c3)); + return {c0, c1, c2, c3}; +} +#elif defined(__SSE__) +RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>> +load4x4f(const float* m) +{ + __m128 r0, r1, r2, r3; + RIVE_INLINE_MEMCPY(&r0, m + 4 * 0, sizeof(r0)); + RIVE_INLINE_MEMCPY(&r1, m + 4 * 1, sizeof(r1)); + RIVE_INLINE_MEMCPY(&r2, m + 4 * 2, sizeof(r2)); + RIVE_INLINE_MEMCPY(&r3, m + 4 * 3, sizeof(r3)); + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + gvec<float, 4> c0, c1, c2, c3; + RIVE_INLINE_MEMCPY(&c0, &r0, sizeof(c0)); + RIVE_INLINE_MEMCPY(&c1, &r1, sizeof(c1)); + RIVE_INLINE_MEMCPY(&c2, &r2, sizeof(c2)); + RIVE_INLINE_MEMCPY(&c3, &r3, sizeof(c3)); + return {c0, c1, c2, c3}; +} +#else +RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>> +load4x4f(const float* m) +{ + gvec<float, 4> c0 = {m[0], m[4], m[8], m[12]}; + gvec<float, 4> c1 = {m[1], m[5], m[9], m[13]}; + gvec<float, 4> c2 = {m[2], m[6], m[10], m[14]}; + gvec<float, 4> c3 = {m[3], m[7], m[11], m[15]}; + return {c0, c1, c2, c3}; +} +#endif + template <typename T, int M, int N> RIVE_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b) { @@ -322,28 +396,9 @@ template <typename T, int N> RIVE_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b) { - auto d = a * b; - T s = d[0]; - for (int i = 1; i < N; ++i) - s += d[i]; - return s; + return sum(a * b); } -// We can use __builtin_reduce_add for integer types. -#if __has_builtin(__builtin_reduce_add) -template <int N> RIVE_ALWAYS_INLINE int32_t dot(gvec<int32_t, N> a, gvec<int32_t, N> b) -{ - auto d = a * b; - return __builtin_reduce_add(d); -} - -template <int N> RIVE_ALWAYS_INLINE uint32_t dot(gvec<uint32_t, N> a, gvec<uint32_t, N> b) -{ - auto d = a * b; - return __builtin_reduce_add(d); -} -#endif - RIVE_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b) { auto c = a * b.yx; @@ -366,8 +421,6 @@ } // namespace simd } // namespace rive -#undef RIVE_INLINE_MEMCPY - namespace rive { template <int N> using vec = simd::gvec<float, N>;
diff --git a/include/rive/math/simd_gvec_polyfill.hpp b/include/rive/math/simd_gvec_polyfill.hpp index a590aab..2066060 100644 --- a/include/rive/math/simd_gvec_polyfill.hpp +++ b/include/rive/math/simd_gvec_polyfill.hpp
@@ -206,6 +206,8 @@ DECL_ARITHMETIC_OP(|); DECL_ARITHMETIC_OP(&); DECL_ARITHMETIC_OP(^); +DECL_ARITHMETIC_OP(<<); +DECL_ARITHMETIC_OP(>>); #undef DECL_ARITHMETIC_OP @@ -249,6 +251,8 @@ { \ return F((gvec<T, N>)x); \ } +#define ENABLE_SWIZZLE_REDUCE(F) \ + template <typename T, int N, Swizzle Z0> T F(gvec<T, N, Z0> x) { return F((gvec<T, N>)x); } #define ENABLE_SWIZZLE1F(F) \ template <int N, Swizzle Z0> gvec<float, N> F(gvec<float, N, Z0> x) \ { \ @@ -287,6 +291,7 @@ } ENABLE_SWIZZLE1(abs) +ENABLE_SWIZZLE_REDUCE(sum) ENABLE_SWIZZLE1F(floor) ENABLE_SWIZZLE1F(ceil) ENABLE_SWIZZLE1F(sqrt) @@ -302,8 +307,13 @@ { store(dst, (gvec<T, N>)vec); } +template <typename U, typename T, int N, Swizzle Z> gvec<U, N> cast(gvec<T, N, Z> x) +{ + return cast<U>((gvec<T, N>)x); +} #undef ENABLE_SWIZZLE1 +#undef ENABLE_SWIZZLE_REDUCE #undef ENABLE_SWIZZLE1F #undef ENABLE_SWIZZLE1B #undef ENABLE_SWIZZLEUT
diff --git a/include/rive/rive_types.hpp b/include/rive/rive_types.hpp index 3450d76..f7d884b 100644 --- a/include/rive/rive_types.hpp +++ b/include/rive/rive_types.hpp
@@ -62,13 +62,18 @@ // Annotations to assert unreachable control flow. #if defined(__GNUC__) || defined(__clang__) -#define RIVE_UNREACHABLE __builtin_unreachable +#define RIVE_UNREACHABLE \ + assert(!(bool)"unreachable reached"); \ + __builtin_unreachable #elif _MSC_VER -#define RIVE_UNREACHABLE() __assume(0) +#define RIVE_UNREACHABLE() \ + assert(!(bool)"unreachable reached"); \ + __assume(0) #else #define RIVE_UNREACHABLE() \ do \ { \ + assert(!(bool)"unreachable reached"); \ } while (0) #endif @@ -107,6 +112,12 @@ #define RIVE_INLINE_MEMCPY memcpy #endif +#ifdef DEBUG +#define RIVE_DEBUG_CODE(CODE) CODE +#else +#define RIVE_DEBUG_CODE(CODE) +#endif + // Backports of later stl functions. namespace rivestd {
diff --git a/include/rive/shapes/paint/color.hpp b/include/rive/shapes/paint/color.hpp index 850fbb3..d83e7f0 100644 --- a/include/rive/shapes/paint/color.hpp +++ b/include/rive/shapes/paint/color.hpp
@@ -17,6 +17,8 @@ unsigned int colorAlpha(ColorInt value); +void UnpackColor4f(ColorInt color, float out[4]); + float colorOpacity(unsigned int value); ColorInt colorWithAlpha(ColorInt value, unsigned int a);
diff --git a/src/shapes/paint/color.cpp b/src/shapes/paint/color.cpp index 059d524..d2deb56 100644 --- a/src/shapes/paint/color.cpp +++ b/src/shapes/paint/color.cpp
@@ -1,4 +1,6 @@ #include "rive/shapes/paint/color.hpp" + +#include "rive/math/simd.hpp" #include <stdio.h> namespace rive @@ -17,6 +19,12 @@ unsigned int colorAlpha(ColorInt value) { return (0xff000000 & value) >> 24; } +void UnpackColor4f(ColorInt color, float out[4]) +{ + float4 color4f = simd::cast<float>(color << uint4{8, 16, 24, 0} >> 24u) / 255.f; + simd::store(out, color4f); +} + float colorOpacity(ColorInt value) { return (float)colorAlpha(value) / 0xFF; } ColorInt colorWithAlpha(ColorInt value, unsigned int a)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp index 2fff755..7a9d3db 100644 --- a/test/simd_test.cpp +++ b/test/simd_test.cpp
@@ -329,6 +329,37 @@ int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()})); } +// Check simd::sum. +TEST_CASE("sum", "[simd]") +{ + { + float4 v = {1, 2, 3, 4}; + CHECK(simd::sum(v) == 10); + CHECK(simd::sum(v.zwxy) == 10); + CHECK(simd::sum(v.xyz) == 6); + CHECK(simd::sum(v.yz) == 5); + CHECK(simd::sum(v.xy.yxyx) == 6); + } + + { + int4 v = {1, 2, 3, 4}; + CHECK(simd::sum(v) == 10); + CHECK(simd::sum(v.zwxy) == 10); + CHECK(simd::sum(v.xyz) == 6); + CHECK(simd::sum(v.yz) == 5); + CHECK(simd::sum(v.xy.yxyx) == 6); + } + + { + uint4 v = {1, 2, 3, 4}; + CHECK(simd::sum(v) == 10); + CHECK(simd::sum(v.zwxy) == 10); + CHECK(simd::sum(v.xyz) == 6); + CHECK(simd::sum(v.yz) == 5); + CHECK(simd::sum(v.xy.yxyx) == 6); + } +} + // Check simd::floor. TEST_CASE("floor", "[simd]") { @@ -467,6 +498,16 @@ } } +TEST_CASE("cast", "[simd]") +{ + float4 f4 = float4{-1.9f, -1.5f, 1.5f, 1.1f}; + CHECK(simd::all(simd::cast<int>(f4) == int4{-1, -1, 1, 1})); + CHECK(simd::all(simd::cast<int>(simd::floor(f4)) == int4{-2, -2, 1, 1})); + CHECK(simd::all(simd::cast<int>(simd::ceil(f4)) == int4{-1, -1, 2, 2})); + CHECK(simd::all(simd::cast<int>(simd::ceil(f4.zwxy)) == int4{2, 2, -1, -1})); + CHECK(simd::all(simd::cast<int>(simd::ceil(f4).zwxy) == int4{2, 2, -1, -1})); +} + // Check simd::dot. TEST_CASE("dot", "[simd]") { @@ -555,4 +596,17 @@ check_mix<5>(); CHECK_ALL((simd::mix(float4{1, 2, 3, 4}, float4{5, 6, 7, 8}, float4(0)) == float4{1, 2, 3, 4})); } + +// Check simd::load4x4f +TEST_CASE("load4x4f", "[simd]") +{ + // Column major. + float m[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; + auto c = simd::load4x4f(m); + CHECK(simd::all(std::get<0>(c) == float4{0, 1, 2, 3})); + CHECK(simd::all(std::get<1>(c) == float4{4, 5, 6, 7})); + CHECK(simd::all(std::get<2>(c) == float4{8, 9, 10, 11})); + CHECK(simd::all(std::get<3>(c) == float4{12, 13, 14, 15})); +} + } // namespace rive