More SIMD features
- Extract a standalone simd::sum() function from simd::dot().
- Add column-major (transposed) loads.
- Add a simd::cast function for type conversion.
- Add rive::UnpackColor4f(), which uses simd to convert a ColorInt to 4 normalized floats.
- Bonus: Add an assert to RIVE_UNREACHABLE and add a RIVE_DEBUG_CODE() macro.
Diffs=
c8b5fdadd More SIMD features
diff --git a/.rive_head b/.rive_head
index ecf6ff9..4758ad4 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-87f079a103eb417091b94ea5ed7eecd90cbc6149
+c8b5fdadd370d61267c0c6352b609f3608f3efd5
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 9cb90aa..f042877 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -17,8 +17,8 @@
#include "rive/rive_types.hpp"
#include <cassert>
#include <limits>
+#include <math.h>
#include <stdint.h>
-#include <string.h>
#include <tuple>
#include <type_traits>
@@ -176,6 +176,27 @@
#endif
}
+template <typename T, int N> RIVE_ALWAYS_INLINE T sum(gvec<T, N> x)
+{
+ T total = x[0]; // Start from lane 0.
+ for (int lane = 1; lane < N; ++lane)
+ total += x[lane]; // Add the remaining lanes in ascending lane order.
+ return total;
+}
+
+// Integer overloads: defer the lane reduction to __builtin_reduce_add when the compiler has it.
+#if __has_builtin(__builtin_reduce_add)
+template <int N> RIVE_ALWAYS_INLINE int32_t sum(gvec<int32_t, N> x)
+{
+ return __builtin_reduce_add(x);
+}
+
+template <int N> RIVE_ALWAYS_INLINE uint32_t sum(gvec<uint32_t, N> x)
+{
+ return __builtin_reduce_add(x);
+}
+#endif // __has_builtin(__builtin_reduce_add)
+
////// Floating Point Functions //////
template <int N> RIVE_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
@@ -289,6 +310,16 @@
return x * (numer / denom) + pi_over_2;
}
+////// Type conversion //////
+// Lane-wise static_cast of a gvec<T, N> into a gvec<U, N>.
+template <typename U, typename T, int N> RIVE_ALWAYS_INLINE gvec<U, N> cast(gvec<T, N> x)
+{
+ gvec<U, N> converted{};
+ for (int lane = 0; lane < N; ++lane)
+ converted[lane] = static_cast<U>(x[lane]);
+ return converted;
+}
+
////// Loading and storing //////
template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
@@ -309,6 +340,49 @@
RIVE_INLINE_MEMCPY(dst, &vec, sizeof(T) * N);
}
+////// Column-major (transposed) loads //////
+// load4x4f reads 16 floats and returns 4 vectors; vector k is {m[k], m[k+4], m[k+8], m[k+12]}.
+#ifdef __ARM_NEON__
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* matrix)
+{
+ float32x4x4_t m = vld4q_f32(matrix); // De-interleaving load: val[k] takes every 4th float from k.
+ gvec<float, 4> c0, c1, c2, c3;
+ RIVE_INLINE_MEMCPY(&c0, &m.val[0], sizeof(c0)); // Copy the NEON registers out as gvecs.
+ RIVE_INLINE_MEMCPY(&c1, &m.val[1], sizeof(c1));
+ RIVE_INLINE_MEMCPY(&c2, &m.val[2], sizeof(c2));
+ RIVE_INLINE_MEMCPY(&c3, &m.val[3], sizeof(c3));
+ return {c0, c1, c2, c3};
+}
+#elif defined(__SSE__)
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* m)
+{
+ __m128 r0, r1, r2, r3;
+ RIVE_INLINE_MEMCPY(&r0, m + 4 * 0, sizeof(r0)); // Copy each run of 4 consecutive floats.
+ RIVE_INLINE_MEMCPY(&r1, m + 4 * 1, sizeof(r1));
+ RIVE_INLINE_MEMCPY(&r2, m + 4 * 2, sizeof(r2));
+ RIVE_INLINE_MEMCPY(&r3, m + 4 * 3, sizeof(r3));
+ _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // In-place 4x4 transpose of the four registers.
+ gvec<float, 4> c0, c1, c2, c3;
+ RIVE_INLINE_MEMCPY(&c0, &r0, sizeof(c0)); // Copy the transposed registers out as gvecs.
+ RIVE_INLINE_MEMCPY(&c1, &r1, sizeof(c1));
+ RIVE_INLINE_MEMCPY(&c2, &r2, sizeof(c2));
+ RIVE_INLINE_MEMCPY(&c3, &r3, sizeof(c3));
+ return {c0, c1, c2, c3};
+}
+#else // Neither NEON nor SSE: gather the strided elements by hand.
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* m)
+{
+ gvec<float, 4> c0 = {m[0], m[4], m[8], m[12]};
+ gvec<float, 4> c1 = {m[1], m[5], m[9], m[13]};
+ gvec<float, 4> c2 = {m[2], m[6], m[10], m[14]};
+ gvec<float, 4> c3 = {m[3], m[7], m[11], m[15]};
+ return {c0, c1, c2, c3};
+}
+#endif
+
template <typename T, int M, int N>
RIVE_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b)
{
@@ -322,28 +396,9 @@
template <typename T, int N> RIVE_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
{
- auto d = a * b;
- T s = d[0];
- for (int i = 1; i < N; ++i)
- s += d[i];
- return s;
+ return sum(a * b); // Multiply lane-wise, then reduce the products with sum().
}
-// We can use __builtin_reduce_add for integer types.
-#if __has_builtin(__builtin_reduce_add)
-template <int N> RIVE_ALWAYS_INLINE int32_t dot(gvec<int32_t, N> a, gvec<int32_t, N> b)
-{
- auto d = a * b;
- return __builtin_reduce_add(d);
-}
-
-template <int N> RIVE_ALWAYS_INLINE uint32_t dot(gvec<uint32_t, N> a, gvec<uint32_t, N> b)
-{
- auto d = a * b;
- return __builtin_reduce_add(d);
-}
-#endif
-
RIVE_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
{
auto c = a * b.yx;
@@ -366,8 +421,6 @@
} // namespace simd
} // namespace rive
-#undef RIVE_INLINE_MEMCPY
-
namespace rive
{
template <int N> using vec = simd::gvec<float, N>;
diff --git a/include/rive/math/simd_gvec_polyfill.hpp b/include/rive/math/simd_gvec_polyfill.hpp
index a590aab..2066060 100644
--- a/include/rive/math/simd_gvec_polyfill.hpp
+++ b/include/rive/math/simd_gvec_polyfill.hpp
@@ -206,6 +206,8 @@
DECL_ARITHMETIC_OP(|);
DECL_ARITHMETIC_OP(&);
DECL_ARITHMETIC_OP(^);
+DECL_ARITHMETIC_OP(<<);
+DECL_ARITHMETIC_OP(>>);
#undef DECL_ARITHMETIC_OP
@@ -249,6 +251,8 @@
{ \
return F((gvec<T, N>)x); \
}
+#define ENABLE_SWIZZLE_REDUCE(F) /* overload of gvec -> scalar function F for swizzled gvecs */ \
+ template <typename T, int N, Swizzle Z0> T F(gvec<T, N, Z0> x) { return F((gvec<T, N>)x); }
#define ENABLE_SWIZZLE1F(F) \
template <int N, Swizzle Z0> gvec<float, N> F(gvec<float, N, Z0> x) \
{ \
@@ -287,6 +291,7 @@
}
ENABLE_SWIZZLE1(abs)
+ENABLE_SWIZZLE_REDUCE(sum)
ENABLE_SWIZZLE1F(floor)
ENABLE_SWIZZLE1F(ceil)
ENABLE_SWIZZLE1F(sqrt)
@@ -302,8 +307,13 @@
{
store(dst, (gvec<T, N>)vec);
}
+template <typename U, typename T, int N, Swizzle Z> gvec<U, N> cast(gvec<T, N, Z> x)
+{
+ return cast<U>((gvec<T, N>)x); // Convert to a gvec<T, N> first, then call cast<U> on it.
+}
#undef ENABLE_SWIZZLE1
+#undef ENABLE_SWIZZLE_REDUCE
#undef ENABLE_SWIZZLE1F
#undef ENABLE_SWIZZLE1B
#undef ENABLE_SWIZZLEUT
diff --git a/include/rive/rive_types.hpp b/include/rive/rive_types.hpp
index 3450d76..f7d884b 100644
--- a/include/rive/rive_types.hpp
+++ b/include/rive/rive_types.hpp
@@ -62,13 +62,18 @@
// Annotations to assert unreachable control flow.
#if defined(__GNUC__) || defined(__clang__)
-#define RIVE_UNREACHABLE __builtin_unreachable
+#define RIVE_UNREACHABLE() /* single statement: safe under an unbraced if/else */ \
+ do { assert(!(bool)"unreachable reached"); \
+ __builtin_unreachable(); } while (0)
#elif _MSC_VER
-#define RIVE_UNREACHABLE() __assume(0)
+#define RIVE_UNREACHABLE() /* single statement: safe under an unbraced if/else */ \
+ do { assert(!(bool)"unreachable reached"); \
+ __assume(0); } while (0)
#else
#define RIVE_UNREACHABLE() \
do \
{ \
+ assert(!(bool)"unreachable reached"); \
} while (0)
#endif
@@ -107,6 +112,12 @@
#define RIVE_INLINE_MEMCPY memcpy
#endif
+#ifdef DEBUG
+#define RIVE_DEBUG_CODE(...) __VA_ARGS__ // Variadic so the code may contain top-level commas.
+#else
+#define RIVE_DEBUG_CODE(...) // Expands to nothing when DEBUG is not defined.
+#endif
+
// Backports of later stl functions.
namespace rivestd
{
diff --git a/include/rive/shapes/paint/color.hpp b/include/rive/shapes/paint/color.hpp
index 850fbb3..d83e7f0 100644
--- a/include/rive/shapes/paint/color.hpp
+++ b/include/rive/shapes/paint/color.hpp
@@ -17,6 +17,8 @@
unsigned int colorAlpha(ColorInt value);
+void UnpackColor4f(ColorInt color, float out[4]); // Unpacks color's 4 bytes into out as value / 255.
+
float colorOpacity(unsigned int value);
ColorInt colorWithAlpha(ColorInt value, unsigned int a);
diff --git a/src/shapes/paint/color.cpp b/src/shapes/paint/color.cpp
index 059d524..d2deb56 100644
--- a/src/shapes/paint/color.cpp
+++ b/src/shapes/paint/color.cpp
@@ -1,4 +1,6 @@
#include "rive/shapes/paint/color.hpp"
+
+#include "rive/math/simd.hpp"
#include <stdio.h>
namespace rive
@@ -17,6 +19,12 @@
unsigned int colorAlpha(ColorInt value) { return (0xff000000 & value) >> 24; }
+void UnpackColor4f(ColorInt color, float out[4])
+{
+ const float4 normalized = simd::cast<float>((color << uint4{8, 16, 24, 0}) >> 24u) / 255.f; // Lanes: bits 16-23, 8-15, 0-7, 24-31.
+ simd::store(out, normalized); // Write the 4 floats to out[0..3].
+}
+
float colorOpacity(ColorInt value) { return (float)colorAlpha(value) / 0xFF; }
ColorInt colorWithAlpha(ColorInt value, unsigned int a)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index 2fff755..7a9d3db 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -329,6 +329,37 @@
int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}));
}
+// Check simd::sum.
+TEST_CASE("sum", "[simd]")
+{
+ { // float lanes.
+ float4 v = {1, 2, 3, 4};
+ CHECK(simd::sum(v) == 10);
+ CHECK(simd::sum(v.zwxy) == 10); // Reordering the lanes leaves the sum unchanged.
+ CHECK(simd::sum(v.xyz) == 6);
+ CHECK(simd::sum(v.yz) == 5);
+ CHECK(simd::sum(v.xy.yxyx) == 6); // Swizzle of a swizzle: (2, 1, 2, 1).
+ }
+
+ { // int lanes.
+ int4 v = {1, 2, 3, 4};
+ CHECK(simd::sum(v) == 10);
+ CHECK(simd::sum(v.zwxy) == 10);
+ CHECK(simd::sum(v.xyz) == 6);
+ CHECK(simd::sum(v.yz) == 5);
+ CHECK(simd::sum(v.xy.yxyx) == 6);
+ }
+
+ { // uint lanes.
+ uint4 v = {1, 2, 3, 4};
+ CHECK(simd::sum(v) == 10);
+ CHECK(simd::sum(v.zwxy) == 10);
+ CHECK(simd::sum(v.xyz) == 6);
+ CHECK(simd::sum(v.yz) == 5);
+ CHECK(simd::sum(v.xy.yxyx) == 6);
+ }
+}
+
// Check simd::floor.
TEST_CASE("floor", "[simd]")
{
@@ -467,6 +498,16 @@
}
}
+TEST_CASE("cast", "[simd]")
+{
+ float4 f4 = float4{-1.9f, -1.5f, 1.5f, 1.1f};
+ CHECK(simd::all(simd::cast<int>(f4) == int4{-1, -1, 1, 1})); // float -> int truncates toward zero.
+ CHECK(simd::all(simd::cast<int>(simd::floor(f4)) == int4{-2, -2, 1, 1}));
+ CHECK(simd::all(simd::cast<int>(simd::ceil(f4)) == int4{-1, -1, 2, 2}));
+ CHECK(simd::all(simd::cast<int>(simd::ceil(f4.zwxy)) == int4{2, 2, -1, -1})); // Swizzle applied to ceil's argument.
+ CHECK(simd::all(simd::cast<int>(simd::ceil(f4).zwxy) == int4{2, 2, -1, -1})); // Swizzle applied to ceil's result.
+}
+
// Check simd::dot.
TEST_CASE("dot", "[simd]")
{
@@ -555,4 +596,17 @@
check_mix<5>();
CHECK_ALL((simd::mix(float4{1, 2, 3, 4}, float4{5, 6, 7, 8}, float4(0)) == float4{1, 2, 3, 4}));
}
+
+// Check simd::load4x4f
+TEST_CASE("load4x4f", "[simd]")
+{
+ // Column major: m[4 * i + k] holds the value expected in lane i of returned vector k.
+ float m[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
+ auto c = simd::load4x4f(m);
+ CHECK(simd::all(std::get<0>(c) == float4{0, 1, 2, 3}));
+ CHECK(simd::all(std::get<1>(c) == float4{4, 5, 6, 7}));
+ CHECK(simd::all(std::get<2>(c) == float4{8, 9, 10, 11}));
+ CHECK(simd::all(std::get<3>(c) == float4{12, 13, 14, 15}));
+}
+
} // namespace rive