More SIMD features

- Extract a standalone simd::sum() function from simd::dot().
- Add column-major (transposed) loads.
- Add a simd::cast function for type conversion.
- Add rive::UnpackColor4f(), which uses simd to convert a ColorInt to 4 normalized floats.
- Bonus: Add an assert to RIVE_UNREACHABLE and add a RIVE_DEBUG_CODE() macro.

Diffs=
c8b5fdadd More SIMD features
diff --git a/.rive_head b/.rive_head
index ecf6ff9..4758ad4 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-87f079a103eb417091b94ea5ed7eecd90cbc6149
+c8b5fdadd370d61267c0c6352b609f3608f3efd5
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 9cb90aa..f042877 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -17,8 +17,8 @@
 #include "rive/rive_types.hpp"
 #include <cassert>
 #include <limits>
+#include <math.h>
 #include <stdint.h>
-#include <string.h>
 #include <tuple>
 #include <type_traits>
 
@@ -176,6 +176,27 @@
 #endif
 }
 
+template <typename T, int N> RIVE_ALWAYS_INLINE T sum(gvec<T, N> x)
+{
+    T s = x[0];
+    for (int i = 1; i < N; ++i)
+        s += x[i];
+    return s;
+}
+
+// We can use __builtin_reduce_add for integer types.
+#if __has_builtin(__builtin_reduce_add)
+template <int N> RIVE_ALWAYS_INLINE int32_t sum(gvec<int32_t, N> x)
+{
+    return __builtin_reduce_add(x);
+}
+
+template <int N> RIVE_ALWAYS_INLINE uint32_t sum(gvec<uint32_t, N> x)
+{
+    return __builtin_reduce_add(x);
+}
+#endif
+
 ////// Floating Point Functions //////
 
 template <int N> RIVE_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
@@ -289,6 +310,16 @@
     return x * (numer / denom) + pi_over_2;
 }
 
+////// Type conversion //////
+
+template <typename U, typename T, int N> RIVE_ALWAYS_INLINE gvec<U, N> cast(gvec<T, N> x)
+{
+    gvec<U, N> y{};
+    for (int i = 0; i < N; ++i)
+        y[i] = static_cast<U>(x[i]);
+    return y;
+}
+
 ////// Loading and storing //////
 
 template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
@@ -309,6 +340,49 @@
     RIVE_INLINE_MEMCPY(dst, &vec, sizeof(T) * N);
 }
 
+////// Column-major (transposed) loads //////
+
+#ifdef __ARM_NEON__
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* matrix)
+{
+    float32x4x4_t m = vld4q_f32(matrix);
+    gvec<float, 4> c0, c1, c2, c3;
+    RIVE_INLINE_MEMCPY(&c0, &m.val[0], sizeof(c0));
+    RIVE_INLINE_MEMCPY(&c1, &m.val[1], sizeof(c1));
+    RIVE_INLINE_MEMCPY(&c2, &m.val[2], sizeof(c2));
+    RIVE_INLINE_MEMCPY(&c3, &m.val[3], sizeof(c3));
+    return {c0, c1, c2, c3};
+}
+#elif defined(__SSE__)
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* m)
+{
+    __m128 r0, r1, r2, r3;
+    RIVE_INLINE_MEMCPY(&r0, m + 4 * 0, sizeof(r0));
+    RIVE_INLINE_MEMCPY(&r1, m + 4 * 1, sizeof(r1));
+    RIVE_INLINE_MEMCPY(&r2, m + 4 * 2, sizeof(r2));
+    RIVE_INLINE_MEMCPY(&r3, m + 4 * 3, sizeof(r3));
+    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+    gvec<float, 4> c0, c1, c2, c3;
+    RIVE_INLINE_MEMCPY(&c0, &r0, sizeof(c0));
+    RIVE_INLINE_MEMCPY(&c1, &r1, sizeof(c1));
+    RIVE_INLINE_MEMCPY(&c2, &r2, sizeof(c2));
+    RIVE_INLINE_MEMCPY(&c3, &r3, sizeof(c3));
+    return {c0, c1, c2, c3};
+}
+#else
+RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+load4x4f(const float* m)
+{
+    gvec<float, 4> c0 = {m[0], m[4], m[8], m[12]};
+    gvec<float, 4> c1 = {m[1], m[5], m[9], m[13]};
+    gvec<float, 4> c2 = {m[2], m[6], m[10], m[14]};
+    gvec<float, 4> c3 = {m[3], m[7], m[11], m[15]};
+    return {c0, c1, c2, c3};
+}
+#endif
+
 template <typename T, int M, int N>
 RIVE_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b)
 {
@@ -322,28 +396,9 @@
 
 template <typename T, int N> RIVE_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
 {
-    auto d = a * b;
-    T s = d[0];
-    for (int i = 1; i < N; ++i)
-        s += d[i];
-    return s;
+    return sum(a * b);
 }
 
-// We can use __builtin_reduce_add for integer types.
-#if __has_builtin(__builtin_reduce_add)
-template <int N> RIVE_ALWAYS_INLINE int32_t dot(gvec<int32_t, N> a, gvec<int32_t, N> b)
-{
-    auto d = a * b;
-    return __builtin_reduce_add(d);
-}
-
-template <int N> RIVE_ALWAYS_INLINE uint32_t dot(gvec<uint32_t, N> a, gvec<uint32_t, N> b)
-{
-    auto d = a * b;
-    return __builtin_reduce_add(d);
-}
-#endif
-
 RIVE_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
 {
     auto c = a * b.yx;
@@ -366,8 +421,6 @@
 } // namespace simd
 } // namespace rive
 
-#undef RIVE_INLINE_MEMCPY
-
 namespace rive
 {
 template <int N> using vec = simd::gvec<float, N>;
diff --git a/include/rive/math/simd_gvec_polyfill.hpp b/include/rive/math/simd_gvec_polyfill.hpp
index a590aab..2066060 100644
--- a/include/rive/math/simd_gvec_polyfill.hpp
+++ b/include/rive/math/simd_gvec_polyfill.hpp
@@ -206,6 +206,8 @@
 DECL_ARITHMETIC_OP(|);
 DECL_ARITHMETIC_OP(&);
 DECL_ARITHMETIC_OP(^);
+DECL_ARITHMETIC_OP(<<);
+DECL_ARITHMETIC_OP(>>);
 
 #undef DECL_ARITHMETIC_OP
 
@@ -249,6 +251,8 @@
     {                                                                                              \
         return F((gvec<T, N>)x);                                                                   \
     }
+#define ENABLE_SWIZZLE_REDUCE(F)                                                                   \
+    template <typename T, int N, Swizzle Z0> T F(gvec<T, N, Z0> x) { return F((gvec<T, N>)x); }
 #define ENABLE_SWIZZLE1F(F)                                                                        \
     template <int N, Swizzle Z0> gvec<float, N> F(gvec<float, N, Z0> x)                            \
     {                                                                                              \
@@ -287,6 +291,7 @@
     }
 
 ENABLE_SWIZZLE1(abs)
+ENABLE_SWIZZLE_REDUCE(sum)
 ENABLE_SWIZZLE1F(floor)
 ENABLE_SWIZZLE1F(ceil)
 ENABLE_SWIZZLE1F(sqrt)
@@ -302,8 +307,13 @@
 {
     store(dst, (gvec<T, N>)vec);
 }
+template <typename U, typename T, int N, Swizzle Z> gvec<U, N> cast(gvec<T, N, Z> x)
+{
+    return cast<U>((gvec<T, N>)x);
+}
 
 #undef ENABLE_SWIZZLE1
+#undef ENABLE_SWIZZLE_REDUCE
 #undef ENABLE_SWIZZLE1F
 #undef ENABLE_SWIZZLE1B
 #undef ENABLE_SWIZZLEUT
diff --git a/include/rive/rive_types.hpp b/include/rive/rive_types.hpp
index 3450d76..f7d884b 100644
--- a/include/rive/rive_types.hpp
+++ b/include/rive/rive_types.hpp
@@ -62,13 +62,18 @@
 
 // Annotations to assert unreachable control flow.
 #if defined(__GNUC__) || defined(__clang__)
-#define RIVE_UNREACHABLE __builtin_unreachable
+#define RIVE_UNREACHABLE                                                                           \
+    assert(!(bool)"unreachable reached");                                                          \
+    __builtin_unreachable
 #elif _MSC_VER
-#define RIVE_UNREACHABLE() __assume(0)
+#define RIVE_UNREACHABLE()                                                                         \
+    assert(!(bool)"unreachable reached");                                                          \
+    __assume(0)
 #else
 #define RIVE_UNREACHABLE()                                                                         \
     do                                                                                             \
     {                                                                                              \
+        assert(!(bool)"unreachable reached");                                                      \
     } while (0)
 #endif
 
@@ -107,6 +112,12 @@
 #define RIVE_INLINE_MEMCPY memcpy
 #endif
 
+#ifdef DEBUG
+#define RIVE_DEBUG_CODE(CODE) CODE
+#else
+#define RIVE_DEBUG_CODE(CODE)
+#endif
+
 // Backports of later stl functions.
 namespace rivestd
 {
diff --git a/include/rive/shapes/paint/color.hpp b/include/rive/shapes/paint/color.hpp
index 850fbb3..d83e7f0 100644
--- a/include/rive/shapes/paint/color.hpp
+++ b/include/rive/shapes/paint/color.hpp
@@ -17,6 +17,8 @@
 
 unsigned int colorAlpha(ColorInt value);
 
+void UnpackColor4f(ColorInt color, float out[4]);
+
 float colorOpacity(unsigned int value);
 
 ColorInt colorWithAlpha(ColorInt value, unsigned int a);
diff --git a/src/shapes/paint/color.cpp b/src/shapes/paint/color.cpp
index 059d524..d2deb56 100644
--- a/src/shapes/paint/color.cpp
+++ b/src/shapes/paint/color.cpp
@@ -1,4 +1,6 @@
 #include "rive/shapes/paint/color.hpp"
+
+#include "rive/math/simd.hpp"
 #include <stdio.h>
 
 namespace rive
@@ -17,6 +19,12 @@
 
 unsigned int colorAlpha(ColorInt value) { return (0xff000000 & value) >> 24; }
 
+void UnpackColor4f(ColorInt color, float out[4])
+{
+    float4 color4f = simd::cast<float>(color << uint4{8, 16, 24, 0} >> 24u) / 255.f;
+    simd::store(out, color4f);
+}
+
 float colorOpacity(ColorInt value) { return (float)colorAlpha(value) / 0xFF; }
 
 ColorInt colorWithAlpha(ColorInt value, unsigned int a)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index 2fff755..7a9d3db 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -329,6 +329,37 @@
                   int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}));
 }
 
+// Check simd::sum.
+TEST_CASE("sum", "[simd]")
+{
+    {
+        float4 v = {1, 2, 3, 4};
+        CHECK(simd::sum(v) == 10);
+        CHECK(simd::sum(v.zwxy) == 10);
+        CHECK(simd::sum(v.xyz) == 6);
+        CHECK(simd::sum(v.yz) == 5);
+        CHECK(simd::sum(v.xy.yxyx) == 6);
+    }
+
+    {
+        int4 v = {1, 2, 3, 4};
+        CHECK(simd::sum(v) == 10);
+        CHECK(simd::sum(v.zwxy) == 10);
+        CHECK(simd::sum(v.xyz) == 6);
+        CHECK(simd::sum(v.yz) == 5);
+        CHECK(simd::sum(v.xy.yxyx) == 6);
+    }
+
+    {
+        uint4 v = {1, 2, 3, 4};
+        CHECK(simd::sum(v) == 10);
+        CHECK(simd::sum(v.zwxy) == 10);
+        CHECK(simd::sum(v.xyz) == 6);
+        CHECK(simd::sum(v.yz) == 5);
+        CHECK(simd::sum(v.xy.yxyx) == 6);
+    }
+}
+
 // Check simd::floor.
 TEST_CASE("floor", "[simd]")
 {
@@ -467,6 +498,16 @@
     }
 }
 
+TEST_CASE("cast", "[simd]")
+{
+    float4 f4 = float4{-1.9f, -1.5f, 1.5f, 1.1f};
+    CHECK(simd::all(simd::cast<int>(f4) == int4{-1, -1, 1, 1}));
+    CHECK(simd::all(simd::cast<int>(simd::floor(f4)) == int4{-2, -2, 1, 1}));
+    CHECK(simd::all(simd::cast<int>(simd::ceil(f4)) == int4{-1, -1, 2, 2}));
+    CHECK(simd::all(simd::cast<int>(simd::ceil(f4.zwxy)) == int4{2, 2, -1, -1}));
+    CHECK(simd::all(simd::cast<int>(simd::ceil(f4).zwxy) == int4{2, 2, -1, -1}));
+}
+
 // Check simd::dot.
 TEST_CASE("dot", "[simd]")
 {
@@ -555,4 +596,17 @@
     check_mix<5>();
     CHECK_ALL((simd::mix(float4{1, 2, 3, 4}, float4{5, 6, 7, 8}, float4(0)) == float4{1, 2, 3, 4}));
 }
+
+// Check simd::load4x4f
+TEST_CASE("load4x4f", "[simd]")
+{
+    // Column major.
+    float m[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
+    auto c = simd::load4x4f(m);
+    CHECK(simd::all(std::get<0>(c) == float4{0, 1, 2, 3}));
+    CHECK(simd::all(std::get<1>(c) == float4{4, 5, 6, 7}));
+    CHECK(simd::all(std::get<2>(c) == float4{8, 9, 10, 11}));
+    CHECK(simd::all(std::get<3>(c) == float4{12, 13, 14, 15}));
+}
+
 } // namespace rive