Only use __builtin_reduce_add for integer types

Clang's language extensions documentation only defines __builtin_reduce_add for integer types:

https://clang.llvm.org/docs/LanguageExtensions.html

Fixes #4458

Diffs=
df91086ce Only __builtin_reduce_add for integer types
diff --git a/.rive_head b/.rive_head
index ef506ed..26d733e 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-4e221ab4f7d90ef3c6611d59ed8f989075431f5b
+df91086cea806be55fc82d13563318c2a0f4e4b1
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 84341b3..4ca983c 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -308,21 +308,40 @@
 
 template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
 {
-    gvec<T, N> d = a * b;
-#if __has_builtin(__builtin_reduce_add)
-    return __builtin_reduce_add(d);
-#else
-#pragma message("performance: __builtin_reduce_add() not supported. Consider updating clang.")
-    T s = d[0];
-    for (int i = 1; i < N; ++i)
-        s += d[i];
-    return s;
-#endif
+    auto d = a * b;
+    if constexpr (N == 2)
+        return d.x + d.y;
+    else if constexpr (N == 3)
+        return d.x + d.y + d.z;
+    else if constexpr (N == 4)
+        return d.x + d.y + d.z + d.w;
+    else
+    {
+        T s = d[0];
+        for (int i = 1; i < N; ++i)
+            s += d[i];
+        return s;
+    }
 }
 
+// We can use __builtin_reduce_add for integer types.
+#if __has_builtin(__builtin_reduce_add)
+template <int N> SIMD_ALWAYS_INLINE int32_t dot(gvec<int32_t, N> a, gvec<int32_t, N> b)
+{
+    auto d = a * b;
+    return __builtin_reduce_add(d);
+}
+
+template <int N> SIMD_ALWAYS_INLINE uint32_t dot(gvec<uint32_t, N> a, gvec<uint32_t, N> b)
+{
+    auto d = a * b;
+    return __builtin_reduce_add(d);
+}
+#endif
+
 SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
 {
-    gvec<float, 2> c = a * b.yx;
+    auto c = a * b.yx;
     return c.x - c.y;
 }
 
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index ed937ac..6ec5a99 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -351,13 +351,20 @@
 TEST_CASE("dot", "[simd]")
 {
     CHECK(simd::dot(int2{0, 1}, int2{1, 0}) == 0);
-    CHECK(simd::dot(int2{1, 0}, int2{0, 1}) == 0);
+    CHECK(simd::dot(uint2{1, 0}, uint2{0, 1}) == 0);
     CHECK(simd::dot(int2{1, 1}, int2{1, -1}) == 0);
-    CHECK(simd::dot(int2{1, 1}, int2{1, 1}) == 2);
+    CHECK(simd::dot(uint2{1, 1}, uint2{1, 1}) == 2);
     CHECK(simd::dot(int2{1, 1}, int2{-1, -1}) == -2);
-    CHECK(simd::dot(simd::gvec<int, 3>{1, 2, 3}, simd::gvec<int, 3>{1, 2, 3}) == 14);
+    CHECK(simd::dot(ivec<3>{1, 2, -3}, ivec<3>{1, 2, 3}) == -4);
+    CHECK(simd::dot(uvec<3>{1, 2, 3}, uvec<3>{1, 2, 3}) == 14);
     CHECK(simd::dot(int4{1, 2, 3, 4}, int4{1, 2, 3, 4}) == 30);
-    CHECK(simd::dot(ivec<5>{1, 2, 3, 4, 5}, ivec<5>{1, 2, 3, 4, 5}) == 55);
+    CHECK(simd::dot(ivec<5>{1, 2, 3, 4, 5}, ivec<5>{1, 2, 3, 4, -5}) == 5);
+    CHECK(simd::dot(uvec<5>{1, 2, 3, 4, 5}, uvec<5>{1, 2, 3, 4, 5}) == 55);
+
+    CHECK(simd::dot(float4{1, 2, 3, 4}, float4{4, 3, 2, 1}) == 20);
+    CHECK(simd::dot(vec<3>{1, 2, 3}, vec<3>{3, 2, 1}) == 10);
+    CHECK(simd::dot(float2{0, 1}, float2{1, 0}) == 0);
+    CHECK(simd::dot(vec<5>{1, 2, 3, 4, 5}, vec<5>{1, 2, 3, 4, 5}) == 55);
 }
 
 // Check simd::cross.