Add more features to simd.hpp

Add floor, ceil, sqrt, fast_acos.

Omit trunc because it isn't supported natively on WASM.

There isn't an elementwise builtin for sqrt, so drop to intrinsics.

fast_acos is a polynomial that approximates acos within ~1 degree.

Make use of "reduce" builtins.

Diffs=
362c448b3 Add more features to simd.hpp
diff --git a/.rive_head b/.rive_head
index 6e18a36..9bae16c 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-210e0ab148b628d05400073230bb70a65e787772
+362c448b3c6d63cf3fc89848819db446213e3155
diff --git a/include/rive/math/math_types.hpp b/include/rive/math/math_types.hpp
index 187b338..278e46a 100644
--- a/include/rive/math/math_types.hpp
+++ b/include/rive/math/math_types.hpp
@@ -14,6 +14,18 @@
 namespace math
 {
 constexpr float PI = 3.14159265f;
+constexpr float EPSILON = 1.f / (1 << 12); // Common threshold for detecting values near zero.
+
+[[maybe_unused]] inline bool nearly_zero(float a, float tolerance = EPSILON)
+{
+    assert(tolerance >= 0);
+    return fabsf(a) <= tolerance;
+}
+
+[[maybe_unused]] inline bool nearly_equal(float a, float b, float tolerance = EPSILON)
+{
+    return nearly_zero(b - a, tolerance);
+}
 
 // Performs a floating point division with conformant IEEE 754 behavior for NaN and Inf.
 //
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 3391c11..84341b3 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -5,8 +5,8 @@
 // An SSE / NEON / WASM_SIMD library based on clang vector types.
 //
 // This header makes use of the clang vector builtins specified in https://reviews.llvm.org/D111529.
-// This effort in clang is still a work in progress, and compiling this header requires an
-// extremely recent version of clang.
+// This effort in clang is still a work in progress, so getting maximum performance from this header
+// requires an extremely recent version of clang.
 //
 // To explore the codegen from this header, paste it into https://godbolt.org/, select a recent
 // clang compiler, and add an -O3 flag.
@@ -18,6 +18,14 @@
 #include <limits>
 #include <stdint.h>
 
+#ifdef __SSE__
+#include <immintrin.h>
+#endif
+
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif
+
 #define SIMD_ALWAYS_INLINE inline __attribute__((always_inline))
 
 // SIMD math can expect conformant IEEE 754 behavior for NaN and Inf.
@@ -46,22 +54,30 @@
 // Returns true if all elements in x are equal to 0.
 template <int N> SIMD_ALWAYS_INLINE bool any(gvec<int32_t, N> x)
 {
+#if __has_builtin(__builtin_reduce_or)
+    return __builtin_reduce_or(x);
+#else
+#pragma message("performance: __builtin_reduce_or() not supported. Consider updating clang.")
     // This particular logic structure gets decent codegen in clang.
-    // TODO: __builtin_reduce_or(x) once it's implemented in the compiler.
     for (int i = 0; i < N; ++i)
     {
         if (x[i])
             return true;
     }
     return false;
+#endif
 }
 
 // Returns true if all elements in x are equal to ~0.
 template <int N> SIMD_ALWAYS_INLINE bool all(gvec<int32_t, N> x)
 {
+#if __has_builtin(__builtin_reduce_and)
+    return __builtin_reduce_and(x);
+#else
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
     // In vector, true is represented by -1 exactly, so we use ~x for "not".
-    // TODO: __builtin_reduce_and(x) once it's implemented in the compiler.
     return !any(~x);
+#endif
 }
 
 template <typename T,
@@ -91,9 +107,7 @@
 #pragma message("performance: vectorized '?:' operator not supported. Consider updating clang.")
     gvec<T, N> ret{};
     for (int i = 0; i < N; ++i)
-    {
         ret[i] = _if[i] ? _then[i] : _else[i];
-    }
     return ret;
 #endif
 }
@@ -148,6 +162,119 @@
 #endif
 }
 
+////// Floating Point Functions //////
+
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
+{
+#if __has_builtin(__builtin_elementwise_floor)
+    return __builtin_elementwise_floor(x);
+#else
+#pragma message(                                                                                   \
+    "performance: __builtin_elementwise_floor() not supported. Consider updating clang.")
+    for (int i = 0; i < N; ++i)
+        x[i] = floorf(x[i]);
+    return x;
+#endif
+}
+
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> ceil(gvec<float, N> x)
+{
+#if __has_builtin(__builtin_elementwise_ceil)
+    return __builtin_elementwise_ceil(x);
+#else
+#pragma message("performance: __builtin_elementwise_ceil() not supported. Consider updating clang.")
+    for (int i = 0; i < N; ++i)
+        x[i] = ceilf(x[i]);
+    return x;
+#endif
+}
+
+// IEEE compliant sqrt.
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> sqrt(gvec<float, N> x)
+{
+    // There isn't an elementwise builtin for sqrt. We define architecture-specific specializations
+    // of this function later.
+    for (int i = 0; i < N; ++i)
+        x[i] = sqrtf(x[i]);
+    return x;
+}
+
+#ifdef __SSE__
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+    __m128 _x;
+    __builtin_memcpy(&_x, &x, sizeof(float) * 4);
+    _x = _mm_sqrt_ps(_x);
+    __builtin_memcpy(&x, &_x, sizeof(float) * 4);
+    return x;
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+    __m128 _x;
+    __builtin_memcpy(&_x, &x, sizeof(float) * 2);
+    _x = _mm_sqrt_ps(_x);
+    __builtin_memcpy(&x, &_x, sizeof(float) * 2);
+    return x;
+}
+#endif
+
+#ifdef __ARM_NEON__
+#ifdef __aarch64__
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+    float32x4_t _x;
+    __builtin_memcpy(&_x, &x, sizeof(float) * 4);
+    _x = vsqrtq_f32(_x);
+    __builtin_memcpy(&x, &_x, sizeof(float) * 4);
+    return x;
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+    float32x2_t _x;
+    __builtin_memcpy(&_x, &x, sizeof(float) * 2);
+    _x = vsqrt_f32(_x);
+    __builtin_memcpy(&x, &_x, sizeof(float) * 2);
+    return x;
+}
+#endif
+#endif
+
+// This will only be present when building with Emscripten and "-msimd128".
+#if __has_builtin(__builtin_wasm_sqrt_f32x4)
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+    return __builtin_wasm_sqrt_f32x4(x);
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+    gvec<float, 4> _x{x.x, x.y};
+    _x = __builtin_wasm_sqrt_f32x4(_x);
+    return _x.xy;
+}
+#endif
+
+// Approximates acos(x) within 0.96 degrees, using the rational polynomial:
+//
+//     acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
+//
+// See: https://stackoverflow.com/a/36387954
+#define SIMD_FAST_ACOS_MAX_ERROR 0.0167552f // .96 degrees
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> fast_acos(gvec<float, N> x)
+{
+    constexpr static float a = -0.939115566365855f;
+    constexpr static float b = 0.9217841528914573f;
+    constexpr static float c = -1.2845906244690837f;
+    constexpr static float d = 0.295624144969963174f;
+    constexpr static float pi_over_2 = 1.5707963267948966f;
+    auto xx = x * x;
+    auto numer = b * xx + a;
+    auto denom = xx * (d * xx + c) + 1;
+    return x * (numer / denom) + pi_over_2;
+}
+
 ////// Loading and storing //////
 
 template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
@@ -182,27 +309,15 @@
 template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
 {
     gvec<T, N> d = a * b;
-    if constexpr (N == 2)
-    {
-        return d.x + d.y;
-    }
-    else if constexpr (N == 3)
-    {
-        return d.x + d.y + d.z;
-    }
-    else if constexpr (N == 4)
-    {
-        return d.x + d.y + d.z + d.w;
-    }
-    else
-    {
-        T s = d[0];
-        for (int i = 1; i < N; ++i)
-        {
-            s += d[i];
-        }
-        return s;
-    }
+#if __has_builtin(__builtin_reduce_add)
+    return __builtin_reduce_add(d);
+#else
+#pragma message("performance: __builtin_reduce_add() not supported. Consider updating clang.")
+    T s = d[0];
+    for (int i = 1; i < N; ++i)
+        s += d[i];
+    return s;
+#endif
 }
 
 SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index 7517267..ed937ac 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -1,4 +1,11 @@
 /*
+ * Copyright 2019 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * "fast_acos" test imported from skia:tests/SkVxTest.cpp
+ *
  * Copyright 2022 Rive
  */
 
@@ -14,6 +21,44 @@
 constexpr float kInf = std::numeric_limits<float>::infinity();
 constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
 
+// Check simd::any.
+TEST_CASE("any", "[simd]")
+{
+    CHECK(!simd::any(int4{0, 0, 0, 0}));
+    CHECK(simd::any(int4{-1, 0, 0, 0}));
+    CHECK(simd::any(int4{0, -1, 0, 0}));
+    CHECK(simd::any(int4{0, 0, -1, 0}));
+    CHECK(simd::any(int4{0, 0, 0, -1}));
+    CHECK(!simd::any(ivec<3>{0, 0, 0}));
+    CHECK(simd::any(ivec<3>{-1, 0, 0}));
+    CHECK(simd::any(ivec<3>{0, -1, 0}));
+    CHECK(simd::any(ivec<3>{0, 0, -1}));
+    CHECK(!simd::any(int2{0, 0}));
+    CHECK(simd::any(int2{-1, 0}));
+    CHECK(simd::any(int2{0, -1}));
+    CHECK(!simd::any(ivec<1>{0}));
+    CHECK(simd::any(ivec<1>{-1}));
+}
+
+// Check simd::all.
+TEST_CASE("all", "[simd]")
+{
+    CHECK(simd::all(int4{-1, -1, -1, -1}));
+    CHECK(!simd::all(int4{0, -1, -1, -1}));
+    CHECK(!simd::all(int4{-1, 0, -1, -1}));
+    CHECK(!simd::all(int4{-1, -1, 0, -1}));
+    CHECK(!simd::all(int4{-1, -1, -1, 0}));
+    CHECK(simd::all(ivec<3>{-1, -1, -1}));
+    CHECK(!simd::all(ivec<3>{0, -1, -1}));
+    CHECK(!simd::all(ivec<3>{-1, 0, -1}));
+    CHECK(!simd::all(ivec<3>{-1, -1, 0}));
+    CHECK(simd::all(int2{-1, -1}));
+    CHECK(!simd::all(int2{0, -1}));
+    CHECK(!simd::all(int2{-1, 0}));
+    CHECK(simd::all(ivec<1>{-1}));
+    CHECK(!simd::all(ivec<1>{0}));
+}
+
 // Verify the simd float types are IEEE 754 compliant for infinity and NaN.
 TEST_CASE("ieee-compliance", "[simd]")
 {
@@ -164,6 +209,144 @@
                   int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}));
 }
 
+// Check simd::floor.
+TEST_CASE("floor", "[simd]")
+{
+    CHECK(simd::all(simd::floor(float4{-1.9f, 1.9f, 2, -2}) == float4{-2, 1, 2, -2}));
+    CHECK(simd::all(simd::floor(float2{kInf, -kInf}) == float2{kInf, -kInf}));
+    CHECK(simd::all(simd::isnan(simd::floor(float2{kNaN, -kNaN}))));
+}
+
+// Check simd::ceil.
+TEST_CASE("ceil", "[simd]")
+{
+    CHECK(simd::all(simd::ceil(float4{-1.9f, 1.9f, 2, -2}) == float4{-1, 2, 2, -2}));
+    CHECK(simd::all(simd::ceil(float2{kInf, -kInf}) == float2{kInf, -kInf}));
+    CHECK(simd::all(simd::isnan(simd::ceil(float2{kNaN, -kNaN}))));
+}
+
+// Check simd::sqrt.
+TEST_CASE("sqrt", "[simd]")
+{
+    CHECK(simd::all(simd::sqrt(float4{1, 4, 9, 16}) == float4{1, 2, 3, 4}));
+    CHECK(simd::all(simd::sqrt(float2{25, 36}) == float2{5, 6}));
+    CHECK(simd::all(simd::sqrt(vec<1>{36}) == vec<1>{6}));
+    CHECK(simd::all(simd::sqrt(vec<5>{49, 64, 81, 100, 121}) == vec<5>{7, 8, 9, 10, 11}));
+    CHECK(simd::all(simd::isnan(simd::sqrt(float4{-1, -kInf, kNaN, -2}))));
+    CHECK(simd::all(simd::sqrt(vec<3>{kInf, 0, 1}) == vec<3>{kInf, 0, 1}));
+}
+
+static bool check_fast_acos(float x, float fast_acos_x)
+{
+    float acosf_x = acosf(x);
+    float error = acosf_x - fast_acos_x;
+    if (!(fabsf(error) <= SIMD_FAST_ACOS_MAX_ERROR))
+    {
+        auto rad2deg = [](float rad) { return rad * 180 / math::PI; };
+        fprintf(stderr,
+                "Larger-than-expected error from skvx::fast_acos\n"
+                "  x=              %f\n"
+                "  fast_acos_x=    %f  (%f degrees\n"
+                "  acosf_x=        %f  (%f degrees\n"
+                "  error=          %f  (%f degrees)\n"
+                "  tolerance=      %f  (%f degrees)\n\n",
+                x,
+                fast_acos_x,
+                rad2deg(fast_acos_x),
+                acosf_x,
+                rad2deg(acosf_x),
+                error,
+                rad2deg(error),
+                SIMD_FAST_ACOS_MAX_ERROR,
+                rad2deg(SIMD_FAST_ACOS_MAX_ERROR));
+        CHECK(false);
+        return false;
+    }
+    return true;
+}
+
+TEST_CASE("fast_acos", "[simd]")
+{
+    float4 boundaries = simd::fast_acos(float4{-1, 0, 1, 0});
+    check_fast_acos(-1, boundaries[0]);
+    check_fast_acos(0, boundaries[1]);
+    check_fast_acos(+1, boundaries[2]);
+
+    // Select a distribution of starting points around which to begin testing fast_acos. These
+    // fall roughly around the known minimum and maximum errors. No need to include -1, 0, or 1
+    // since those were just tested above. (Those are tricky because 0 is an inflection and the
+    // derivative is infinite at 1 and -1.)
+    using float8 = vec<8>;
+    float8 x = {-.99f, -.8f, -.4f, -.2f, .2f, .4f, .8f, .99f};
+
+    // Converge at the various local minima and maxima of "fast_acos(x) - cosf(x)" and verify that
+    // fast_acos is always within "kTolerance" degrees of the expected answer.
+    float8 err_;
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        // Run our approximate inverse cosine approximation.
+        auto fast_acos_x = simd::fast_acos(x);
+
+        // Find d/dx(error)
+        //    = d/dx(fast_acos(x) - acos(x))
+        //    = (f'g - fg')/gg + 1/sqrt(1 - x^2), [where f = bx^3 + ax, g = dx^4 + cx^2 + 1]
+        float8 xx = x * x;
+        float8 a = -0.939115566365855f;
+        float8 b = 0.9217841528914573f;
+        float8 c = -1.2845906244690837f;
+        float8 d = 0.295624144969963174f;
+        float8 f = (b * xx + a) * x;
+        float8 f_ = 3 * b * xx + a;
+        float8 g = (d * xx + c) * xx + 1;
+        float8 g_ = (4 * d * xx + 2 * c) * x;
+        float8 gg = g * g;
+        float8 q = simd::sqrt(1 - xx);
+        err_ = (f_ * g - f * g_) / gg + 1 / q;
+
+        // Find d^2/dx^2(error)
+        //    = ((f''g - fg'')g^2 - (f'g - fg')2gg') / g^4 + x(1 - x^2)^(-3/2)
+        //    = ((f''g - fg'')g - (f'g - fg')2g') / g^3 + x(1 - x^2)^(-3/2)
+        float8 f__ = 6 * b * x;
+        float8 g__ = 12 * d * xx + 2 * c;
+        float8 err__ =
+            ((f__ * g - f * g__) * g - (f_ * g - f * g_) * 2 * g_) / (gg * g) + x / ((1 - xx) * q);
+
+#if 0
+        SkDebugf("\n\niter %i\n", iter);
+#endif
+        // Ensure each lane's approximation is within maximum error.
+        for (int j = 0; j < 8; ++j)
+        {
+#if 0
+            SkDebugf("x=%f  err=%f  err'=%f  err''=%f\n",
+                     x[j], rad2deg(skvx::fast_acos_x[j] - acosf(x[j])),
+                     rad2deg(err_[j]), rad2deg(err__[j]));
+#endif
+            if (!check_fast_acos(x[j], fast_acos_x[j]))
+            {
+                return;
+            }
+        }
+
+        // Use Newton's method to update the x values to locations closer to their local minimum or
+        // maximum. (This is where d/dx(error) == 0.)
+        x -= err_ / err__;
+        x = simd::clamp<float, 8>(x, -.99f, .99f);
+    }
+
+    // Verify each lane converged to a local minimum or maximum.
+    for (int j = 0; j < 8; ++j)
+    {
+        REQUIRE(math::nearly_zero(err_[j]));
+    }
+
+    // Make sure we found all the actual known locations of local min/max error.
+    for (float knownRoot : {-0.983536f, -0.867381f, -0.410923f, 0.410923f, 0.867381f, 0.983536f})
+    {
+        CHECK(simd::any(simd::abs(x - knownRoot) < math::EPSILON));
+    }
+}
+
 // Check simd::dot.
 TEST_CASE("dot", "[simd]")
 {