Add more features to simd.hpp
Add floor, ceil, sqrt, fast_acos.
Omit trunc because it isn't supported natively on WASM.
There isn't an elementwise builtin for sqrt, so drop to intrinsics.
fast_acos is a polynomial that approximates acos within ~1 degree.
Make use of "reduce" builtins.
Diffs=
362c448b3 Add more features to simd.hpp
diff --git a/.rive_head b/.rive_head
index 6e18a36..9bae16c 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-210e0ab148b628d05400073230bb70a65e787772
+362c448b3c6d63cf3fc89848819db446213e3155
diff --git a/include/rive/math/math_types.hpp b/include/rive/math/math_types.hpp
index 187b338..278e46a 100644
--- a/include/rive/math/math_types.hpp
+++ b/include/rive/math/math_types.hpp
@@ -14,6 +14,18 @@
namespace math
{
constexpr float PI = 3.14159265f;
+constexpr float EPSILON = 1.f / (1 << 12); // Common threshold for detecting values near zero.
+
+[[maybe_unused]] inline bool nearly_zero(float a, float tolerance = EPSILON)
+{
+ assert(tolerance >= 0);
+ return fabsf(a) <= tolerance;
+}
+
+[[maybe_unused]] inline bool nearly_equal(float a, float b, float tolerance = EPSILON)
+{
+ return nearly_zero(b - a, tolerance);
+}
// Performs a floating point division with conformant IEEE 754 behavior for NaN and Inf.
//
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 3391c11..84341b3 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -5,8 +5,8 @@
// An SSE / NEON / WASM_SIMD library based on clang vector types.
//
// This header makes use of the clang vector builtins specified in https://reviews.llvm.org/D111529.
-// This effort in clang is still a work in progress, and compiling this header requires an
-// extremely recent version of clang.
+// This effort in clang is still a work in progress, so getting maximum performance from this header
+// requires an extremely recent version of clang.
//
// To explore the codegen from this header, paste it into https://godbolt.org/, select a recent
// clang compiler, and add an -O3 flag.
@@ -18,6 +18,14 @@
#include <limits>
#include <stdint.h>
+#ifdef __SSE__
+#include <immintrin.h>
+#endif
+
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif
+
#define SIMD_ALWAYS_INLINE inline __attribute__((always_inline))
// SIMD math can expect conformant IEEE 754 behavior for NaN and Inf.
@@ -46,22 +54,30 @@
// Returns true if all elements in x are equal to 0.
template <int N> SIMD_ALWAYS_INLINE bool any(gvec<int32_t, N> x)
{
+#if __has_builtin(__builtin_reduce_or)
+ return __builtin_reduce_or(x);
+#else
+#pragma message("performance: __builtin_reduce_or() not supported. Consider updating clang.")
// This particular logic structure gets decent codegen in clang.
- // TODO: __builtin_reduce_or(x) once it's implemented in the compiler.
for (int i = 0; i < N; ++i)
{
if (x[i])
return true;
}
return false;
+#endif
}
// Returns true if all elements in x are equal to ~0.
template <int N> SIMD_ALWAYS_INLINE bool all(gvec<int32_t, N> x)
{
+#if __has_builtin(__builtin_reduce_and)
+ return __builtin_reduce_and(x);
+#else
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
// In vector, true is represented by -1 exactly, so we use ~x for "not".
- // TODO: __builtin_reduce_and(x) once it's implemented in the compiler.
return !any(~x);
+#endif
}
template <typename T,
@@ -91,9 +107,7 @@
#pragma message("performance: vectorized '?:' operator not supported. Consider updating clang.")
gvec<T, N> ret{};
for (int i = 0; i < N; ++i)
- {
ret[i] = _if[i] ? _then[i] : _else[i];
- }
return ret;
#endif
}
@@ -148,6 +162,119 @@
#endif
}
+////// Floating Point Functions //////
+
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
+{
+#if __has_builtin(__builtin_elementwise_floor)
+ return __builtin_elementwise_floor(x);
+#else
+#pragma message( \
+ "performance: __builtin_elementwise_floor() not supported. Consider updating clang.")
+ for (int i = 0; i < N; ++i)
+ x[i] = floorf(x[i]);
+ return x;
+#endif
+}
+
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> ceil(gvec<float, N> x)
+{
+#if __has_builtin(__builtin_elementwise_ceil)
+ return __builtin_elementwise_ceil(x);
+#else
+#pragma message("performance: __builtin_elementwise_ceil() not supported. Consider updating clang.")
+ for (int i = 0; i < N; ++i)
+ x[i] = ceilf(x[i]);
+ return x;
+#endif
+}
+
+// IEEE compliant sqrt.
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> sqrt(gvec<float, N> x)
+{
+ // There isn't an elementwise builtin for sqrt. We define architecture-specific specializations
+ // of this function later.
+ for (int i = 0; i < N; ++i)
+ x[i] = sqrtf(x[i]);
+ return x;
+}
+
+#ifdef __SSE__
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+ __m128 _x;
+ __builtin_memcpy(&_x, &x, sizeof(float) * 4);
+ _x = _mm_sqrt_ps(_x);
+ __builtin_memcpy(&x, &_x, sizeof(float) * 4);
+ return x;
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+ __m128 _x;
+ __builtin_memcpy(&_x, &x, sizeof(float) * 2);
+ _x = _mm_sqrt_ps(_x);
+ __builtin_memcpy(&x, &_x, sizeof(float) * 2);
+ return x;
+}
+#endif
+
+#ifdef __ARM_NEON__
+#ifdef __aarch64__
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+ float32x4_t _x;
+ __builtin_memcpy(&_x, &x, sizeof(float) * 4);
+ _x = vsqrtq_f32(_x);
+ __builtin_memcpy(&x, &_x, sizeof(float) * 4);
+ return x;
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+ float32x2_t _x;
+ __builtin_memcpy(&_x, &x, sizeof(float) * 2);
+ _x = vsqrt_f32(_x);
+ __builtin_memcpy(&x, &_x, sizeof(float) * 2);
+ return x;
+}
+#endif
+#endif
+
+// This will only be present when building with Emscripten and "-msimd128".
+#if __has_builtin(__builtin_wasm_sqrt_f32x4)
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+{
+ return __builtin_wasm_sqrt_f32x4(x);
+}
+
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+{
+ gvec<float, 4> _x{x.x, x.y};
+ _x = __builtin_wasm_sqrt_f32x4(_x);
+ return _x.xy;
+}
+#endif
+
+// Approximates acos(x) within 0.96 degrees, using the rational polynomial:
+//
+// acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2
+//
+// See: https://stackoverflow.com/a/36387954
+#define SIMD_FAST_ACOS_MAX_ERROR 0.0167552f // .96 degrees
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> fast_acos(gvec<float, N> x)
+{
+ constexpr static float a = -0.939115566365855f;
+ constexpr static float b = 0.9217841528914573f;
+ constexpr static float c = -1.2845906244690837f;
+ constexpr static float d = 0.295624144969963174f;
+ constexpr static float pi_over_2 = 1.5707963267948966f;
+ auto xx = x * x;
+ auto numer = b * xx + a;
+ auto denom = xx * (d * xx + c) + 1;
+ return x * (numer / denom) + pi_over_2;
+}
+
////// Loading and storing //////
template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
@@ -182,27 +309,15 @@
template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
{
gvec<T, N> d = a * b;
- if constexpr (N == 2)
- {
- return d.x + d.y;
- }
- else if constexpr (N == 3)
- {
- return d.x + d.y + d.z;
- }
- else if constexpr (N == 4)
- {
- return d.x + d.y + d.z + d.w;
- }
- else
- {
- T s = d[0];
- for (int i = 1; i < N; ++i)
- {
- s += d[i];
- }
- return s;
- }
+#if __has_builtin(__builtin_reduce_add)
+ return __builtin_reduce_add(d);
+#else
+#pragma message("performance: __builtin_reduce_add() not supported. Consider updating clang.")
+ T s = d[0];
+ for (int i = 1; i < N; ++i)
+ s += d[i];
+ return s;
+#endif
}
SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index 7517267..ed937ac 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -1,4 +1,11 @@
/*
+ * Copyright 2019 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * "fast_acos" test imported from skia:tests/SkVxTest.cpp
+ *
* Copyright 2022 Rive
*/
@@ -14,6 +21,44 @@
constexpr float kInf = std::numeric_limits<float>::infinity();
constexpr float kNaN = std::numeric_limits<float>::quiet_NaN();
+// Check simd::any.
+TEST_CASE("any", "[simd]")
+{
+ CHECK(!simd::any(int4{0, 0, 0, 0}));
+ CHECK(simd::any(int4{-1, 0, 0, 0}));
+ CHECK(simd::any(int4{0, -1, 0, 0}));
+ CHECK(simd::any(int4{0, 0, -1, 0}));
+ CHECK(simd::any(int4{0, 0, 0, -1}));
+ CHECK(!simd::any(ivec<3>{0, 0, 0}));
+ CHECK(simd::any(ivec<3>{-1, 0, 0}));
+ CHECK(simd::any(ivec<3>{0, -1, 0}));
+ CHECK(simd::any(ivec<3>{0, 0, -1}));
+ CHECK(!simd::any(int2{0, 0}));
+ CHECK(simd::any(int2{-1, 0}));
+ CHECK(simd::any(int2{0, -1}));
+ CHECK(!simd::any(ivec<1>{0}));
+ CHECK(simd::any(ivec<1>{-1}));
+}
+
+// Check simd::all.
+TEST_CASE("all", "[simd]")
+{
+ CHECK(simd::all(int4{-1, -1, -1, -1}));
+ CHECK(!simd::all(int4{0, -1, -1, -1}));
+ CHECK(!simd::all(int4{-1, 0, -1, -1}));
+ CHECK(!simd::all(int4{-1, -1, 0, -1}));
+ CHECK(!simd::all(int4{-1, -1, -1, 0}));
+ CHECK(simd::all(ivec<3>{-1, -1, -1}));
+ CHECK(!simd::all(ivec<3>{0, -1, -1}));
+ CHECK(!simd::all(ivec<3>{-1, 0, -1}));
+ CHECK(!simd::all(ivec<3>{-1, -1, 0}));
+ CHECK(simd::all(int2{-1, -1}));
+ CHECK(!simd::all(int2{0, -1}));
+ CHECK(!simd::all(int2{-1, 0}));
+ CHECK(simd::all(ivec<1>{-1}));
+ CHECK(!simd::all(ivec<1>{0}));
+}
+
// Verify the simd float types are IEEE 754 compliant for infinity and NaN.
TEST_CASE("ieee-compliance", "[simd]")
{
@@ -164,6 +209,144 @@
int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}));
}
+// Check simd::floor.
+TEST_CASE("floor", "[simd]")
+{
+ CHECK(simd::all(simd::floor(float4{-1.9f, 1.9f, 2, -2}) == float4{-2, 1, 2, -2}));
+ CHECK(simd::all(simd::floor(float2{kInf, -kInf}) == float2{kInf, -kInf}));
+ CHECK(simd::all(simd::isnan(simd::floor(float2{kNaN, -kNaN}))));
+}
+
+// Check simd::ceil.
+TEST_CASE("ceil", "[simd]")
+{
+ CHECK(simd::all(simd::ceil(float4{-1.9f, 1.9f, 2, -2}) == float4{-1, 2, 2, -2}));
+ CHECK(simd::all(simd::ceil(float2{kInf, -kInf}) == float2{kInf, -kInf}));
+ CHECK(simd::all(simd::isnan(simd::ceil(float2{kNaN, -kNaN}))));
+}
+
+// Check simd::sqrt.
+TEST_CASE("sqrt", "[simd]")
+{
+ CHECK(simd::all(simd::sqrt(float4{1, 4, 9, 16}) == float4{1, 2, 3, 4}));
+ CHECK(simd::all(simd::sqrt(float2{25, 36}) == float2{5, 6}));
+ CHECK(simd::all(simd::sqrt(vec<1>{36}) == vec<1>{6}));
+ CHECK(simd::all(simd::sqrt(vec<5>{49, 64, 81, 100, 121}) == vec<5>{7, 8, 9, 10, 11}));
+ CHECK(simd::all(simd::isnan(simd::sqrt(float4{-1, -kInf, kNaN, -2}))));
+ CHECK(simd::all(simd::sqrt(vec<3>{kInf, 0, 1}) == vec<3>{kInf, 0, 1}));
+}
+
+static bool check_fast_acos(float x, float fast_acos_x)
+{
+ float acosf_x = acosf(x);
+ float error = acosf_x - fast_acos_x;
+ if (!(fabsf(error) <= SIMD_FAST_ACOS_MAX_ERROR))
+ {
+ auto rad2deg = [](float rad) { return rad * 180 / math::PI; };
+ fprintf(stderr,
+ "Larger-than-expected error from skvx::fast_acos\n"
+ " x= %f\n"
+ " fast_acos_x= %f (%f degrees\n"
+ " acosf_x= %f (%f degrees\n"
+ " error= %f (%f degrees)\n"
+ " tolerance= %f (%f degrees)\n\n",
+ x,
+ fast_acos_x,
+ rad2deg(fast_acos_x),
+ acosf_x,
+ rad2deg(acosf_x),
+ error,
+ rad2deg(error),
+ SIMD_FAST_ACOS_MAX_ERROR,
+ rad2deg(SIMD_FAST_ACOS_MAX_ERROR));
+ CHECK(false);
+ return false;
+ }
+ return true;
+}
+
+TEST_CASE("fast_acos", "[simd]")
+{
+ float4 boundaries = simd::fast_acos(float4{-1, 0, 1, 0});
+ check_fast_acos(-1, boundaries[0]);
+ check_fast_acos(0, boundaries[1]);
+ check_fast_acos(+1, boundaries[2]);
+
+ // Select a distribution of starting points around which to begin testing fast_acos. These
+ // fall roughly around the known minimum and maximum errors. No need to include -1, 0, or 1
+ // since those were just tested above. (Those are tricky because 0 is an inflection and the
+ // derivative is infinite at 1 and -1.)
+ using float8 = vec<8>;
+ float8 x = {-.99f, -.8f, -.4f, -.2f, .2f, .4f, .8f, .99f};
+
+ // Converge at the various local minima and maxima of "fast_acos(x) - cosf(x)" and verify that
+ // fast_acos is always within "kTolerance" degrees of the expected answer.
+ float8 err_;
+ for (int iter = 0; iter < 10; ++iter)
+ {
+ // Run our approximate inverse cosine approximation.
+ auto fast_acos_x = simd::fast_acos(x);
+
+ // Find d/dx(error)
+ // = d/dx(fast_acos(x) - acos(x))
+ // = (f'g - fg')/gg + 1/sqrt(1 - x^2), [where f = bx^3 + ax, g = dx^4 + cx^2 + 1]
+ float8 xx = x * x;
+ float8 a = -0.939115566365855f;
+ float8 b = 0.9217841528914573f;
+ float8 c = -1.2845906244690837f;
+ float8 d = 0.295624144969963174f;
+ float8 f = (b * xx + a) * x;
+ float8 f_ = 3 * b * xx + a;
+ float8 g = (d * xx + c) * xx + 1;
+ float8 g_ = (4 * d * xx + 2 * c) * x;
+ float8 gg = g * g;
+ float8 q = simd::sqrt(1 - xx);
+ err_ = (f_ * g - f * g_) / gg + 1 / q;
+
+ // Find d^2/dx^2(error)
+ // = ((f''g - fg'')g^2 - (f'g - fg')2gg') / g^4 + x(1 - x^2)^(-3/2)
+ // = ((f''g - fg'')g - (f'g - fg')2g') / g^3 + x(1 - x^2)^(-3/2)
+ float8 f__ = 6 * b * x;
+ float8 g__ = 12 * d * xx + 2 * c;
+ float8 err__ =
+ ((f__ * g - f * g__) * g - (f_ * g - f * g_) * 2 * g_) / (gg * g) + x / ((1 - xx) * q);
+
+#if 0
+ SkDebugf("\n\niter %i\n", iter);
+#endif
+ // Ensure each lane's approximation is within maximum error.
+ for (int j = 0; j < 8; ++j)
+ {
+#if 0
+ SkDebugf("x=%f err=%f err'=%f err''=%f\n",
+ x[j], rad2deg(skvx::fast_acos_x[j] - acosf(x[j])),
+ rad2deg(err_[j]), rad2deg(err__[j]));
+#endif
+ if (!check_fast_acos(x[j], fast_acos_x[j]))
+ {
+ return;
+ }
+ }
+
+ // Use Newton's method to update the x values to locations closer to their local minimum or
+ // maximum. (This is where d/dx(error) == 0.)
+ x -= err_ / err__;
+ x = simd::clamp<float, 8>(x, -.99f, .99f);
+ }
+
+ // Verify each lane converged to a local minimum or maximum.
+ for (int j = 0; j < 8; ++j)
+ {
+ REQUIRE(math::nearly_zero(err_[j]));
+ }
+
+ // Make sure we found all the actual known locations of local min/max error.
+ for (float knownRoot : {-0.983536f, -0.867381f, -0.410923f, 0.410923f, 0.867381f, 0.983536f})
+ {
+ CHECK(simd::any(simd::abs(x - knownRoot) < math::EPSILON));
+ }
+}
+
// Check simd::dot.
TEST_CASE("dot", "[simd]")
{