Add more features to simd.hpp Add floor, ceil, sqrt, fast_acos. Omit trunc because it isn't supported natively on WASM. There isn't an elementwise builtin for sqrt, so drop to intrinsics. fast_acos is a polynomial that approximates acos within ~1 degree. Make use of "reduce" builtins. Diffs= 362c448b3 Add more features to simd.hpp
diff --git a/.rive_head b/.rive_head index 6e18a36..9bae16c 100644 --- a/.rive_head +++ b/.rive_head
@@ -1 +1 @@ -210e0ab148b628d05400073230bb70a65e787772 +362c448b3c6d63cf3fc89848819db446213e3155
diff --git a/include/rive/math/math_types.hpp b/include/rive/math/math_types.hpp index 187b338..278e46a 100644 --- a/include/rive/math/math_types.hpp +++ b/include/rive/math/math_types.hpp
@@ -14,6 +14,18 @@ namespace math { constexpr float PI = 3.14159265f; +constexpr float EPSILON = 1.f / (1 << 12); // Common threshold for detecting values near zero. + +[[maybe_unused]] inline bool nearly_zero(float a, float tolerance = EPSILON) +{ + assert(tolerance >= 0); + return fabsf(a) <= tolerance; +} + +[[maybe_unused]] inline bool nearly_equal(float a, float b, float tolerance = EPSILON) +{ + return nearly_zero(b - a, tolerance); +} // Performs a floating point division with conformant IEEE 754 behavior for NaN and Inf. //
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp index 3391c11..84341b3 100644 --- a/include/rive/math/simd.hpp +++ b/include/rive/math/simd.hpp
@@ -5,8 +5,8 @@ // An SSE / NEON / WASM_SIMD library based on clang vector types. // // This header makes use of the clang vector builtins specified in https://reviews.llvm.org/D111529. -// This effort in clang is still a work in progress, and compiling this header requires an -// extremely recent version of clang. +// This effort in clang is still a work in progress, so getting maximum performance from this header +// requires an extremely recent version of clang. // // To explore the codegen from this header, paste it into https://godbolt.org/, select a recent // clang compiler, and add an -O3 flag. @@ -18,6 +18,14 @@ #include <limits> #include <stdint.h> +#ifdef __SSE__ +#include <immintrin.h> +#endif + +#ifdef __ARM_NEON__ +#include <arm_neon.h> +#endif + #define SIMD_ALWAYS_INLINE inline __attribute__((always_inline)) // SIMD math can expect conformant IEEE 754 behavior for NaN and Inf. @@ -46,22 +54,30 @@ // Returns true if all elements in x are equal to 0. template <int N> SIMD_ALWAYS_INLINE bool any(gvec<int32_t, N> x) { +#if __has_builtin(__builtin_reduce_or) + return __builtin_reduce_or(x); +#else +#pragma message("performance: __builtin_reduce_or() not supported. Consider updating clang.") // This particular logic structure gets decent codegen in clang. - // TODO: __builtin_reduce_or(x) once it's implemented in the compiler. for (int i = 0; i < N; ++i) { if (x[i]) return true; } return false; +#endif } // Returns true if all elements in x are equal to ~0. template <int N> SIMD_ALWAYS_INLINE bool all(gvec<int32_t, N> x) { +#if __has_builtin(__builtin_reduce_and) + return __builtin_reduce_and(x); +#else +#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.") // In vector, true is represented by -1 exactly, so we use ~x for "not". - // TODO: __builtin_reduce_and(x) once it's implemented in the compiler. return !any(~x); +#endif } template <typename T, @@ -91,9 +107,7 @@ #pragma message("performance: vectorized '?:' operator not supported. Consider updating clang.") gvec<T, N> ret{}; for (int i = 0; i < N; ++i) - { ret[i] = _if[i] ? _then[i] : _else[i]; - } return ret; #endif } @@ -148,6 +162,119 @@ #endif } +////// Floating Point Functions ////// + +template <int N> SIMD_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x) +{ +#if __has_builtin(__builtin_elementwise_floor) + return __builtin_elementwise_floor(x); +#else +#pragma message( \ + "performance: __builtin_elementwise_floor() not supported. Consider updating clang.") + for (int i = 0; i < N; ++i) + x[i] = floorf(x[i]); + return x; +#endif +} + +template <int N> SIMD_ALWAYS_INLINE gvec<float, N> ceil(gvec<float, N> x) +{ +#if __has_builtin(__builtin_elementwise_ceil) + return __builtin_elementwise_ceil(x); +#else +#pragma message("performance: __builtin_elementwise_ceil() not supported. Consider updating clang.") + for (int i = 0; i < N; ++i) + x[i] = ceilf(x[i]); + return x; +#endif +} + +// IEEE compliant sqrt. +template <int N> SIMD_ALWAYS_INLINE gvec<float, N> sqrt(gvec<float, N> x) +{ + // There isn't an elementwise builtin for sqrt. We define architecture-specific specializations + // of this function later. + for (int i = 0; i < N; ++i) + x[i] = sqrtf(x[i]); + return x; +} + +#ifdef __SSE__ +template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x) +{ + __m128 _x; + __builtin_memcpy(&_x, &x, sizeof(float) * 4); + _x = _mm_sqrt_ps(_x); + __builtin_memcpy(&x, &_x, sizeof(float) * 4); + return x; +} + +template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x) +{ + __m128 _x; + __builtin_memcpy(&_x, &x, sizeof(float) * 2); + _x = _mm_sqrt_ps(_x); + __builtin_memcpy(&x, &_x, sizeof(float) * 2); + return x; +} +#endif + +#ifdef __ARM_NEON__ +#ifdef __aarch64__ +template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x) +{ + float32x4_t _x; + __builtin_memcpy(&_x, &x, sizeof(float) * 4); + _x = vsqrtq_f32(_x); + __builtin_memcpy(&x, &_x, sizeof(float) * 4); + return x; +} + +template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x) +{ + float32x2_t _x; + __builtin_memcpy(&_x, &x, sizeof(float) * 2); + _x = vsqrt_f32(_x); + __builtin_memcpy(&x, &_x, sizeof(float) * 2); + return x; +} +#endif +#endif + +// This will only be present when building with Emscripten and "-msimd128". +#if __has_builtin(__builtin_wasm_sqrt_f32x4) +template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x) +{ + return __builtin_wasm_sqrt_f32x4(x); +} + +template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x) +{ + gvec<float, 4> _x{x.x, x.y}; + _x = __builtin_wasm_sqrt_f32x4(_x); + return _x.xy; +} +#endif + +// Approximates acos(x) within 0.96 degrees, using the rational polynomial: +// +// acos(x) ~= (bx^3 + ax) / (dx^4 + cx^2 + 1) + pi/2 +// +// See: https://stackoverflow.com/a/36387954 +#define SIMD_FAST_ACOS_MAX_ERROR 0.0167552f // .96 degrees +template <int N> SIMD_ALWAYS_INLINE gvec<float, N> fast_acos(gvec<float, N> x) +{ + constexpr static float a = -0.939115566365855f; + constexpr static float b = 0.9217841528914573f; + constexpr static float c = -1.2845906244690837f; + constexpr static float d = 0.295624144969963174f; + constexpr static float pi_over_2 = 1.5707963267948966f; + auto xx = x * x; + auto numer = b * xx + a; + auto denom = xx * (d * xx + c) + 1; + return x * (numer / denom) + pi_over_2; +} + ////// Loading and storing ////// template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> load(const void* ptr) @@ -182,27 +309,15 @@ template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b) { gvec<T, N> d = a * b; - if constexpr (N == 2) - { - return d.x + d.y; - } - else if constexpr (N == 3) - { - return d.x + d.y + d.z; - } - else if constexpr (N == 4) - { - return d.x + d.y + d.z + d.w; - } - else - { - T s = d[0]; - for (int i = 1; i < N; ++i) - { - s += d[i]; - } - return s; - } +#if __has_builtin(__builtin_reduce_add) + return __builtin_reduce_add(d); +#else +#pragma message("performance: __builtin_reduce_add() not supported. Consider updating clang.") + T s = d[0]; + for (int i = 1; i < N; ++i) + s += d[i]; + return s; +#endif } SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
diff --git a/test/simd_test.cpp b/test/simd_test.cpp index 7517267..ed937ac 100644 --- a/test/simd_test.cpp +++ b/test/simd_test.cpp
@@ -1,4 +1,11 @@ /* + * Copyright 2019 Google Inc. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file. + * + * "fast_acos" test imported from skia:tests/SkVxTest.cpp + * * Copyright 2022 Rive */ @@ -14,6 +21,44 @@ constexpr float kInf = std::numeric_limits<float>::infinity(); constexpr float kNaN = std::numeric_limits<float>::quiet_NaN(); +// Check simd::any. +TEST_CASE("any", "[simd]") +{ + CHECK(!simd::any(int4{0, 0, 0, 0})); + CHECK(simd::any(int4{-1, 0, 0, 0})); + CHECK(simd::any(int4{0, -1, 0, 0})); + CHECK(simd::any(int4{0, 0, -1, 0})); + CHECK(simd::any(int4{0, 0, 0, -1})); + CHECK(!simd::any(ivec<3>{0, 0, 0})); + CHECK(simd::any(ivec<3>{-1, 0, 0})); + CHECK(simd::any(ivec<3>{0, -1, 0})); + CHECK(simd::any(ivec<3>{0, 0, -1})); + CHECK(!simd::any(int2{0, 0})); + CHECK(simd::any(int2{-1, 0})); + CHECK(simd::any(int2{0, -1})); + CHECK(!simd::any(ivec<1>{0})); + CHECK(simd::any(ivec<1>{-1})); +} + +// Check simd::all. +TEST_CASE("all", "[simd]") +{ + CHECK(simd::all(int4{-1, -1, -1, -1})); + CHECK(!simd::all(int4{0, -1, -1, -1})); + CHECK(!simd::all(int4{-1, 0, -1, -1})); + CHECK(!simd::all(int4{-1, -1, 0, -1})); + CHECK(!simd::all(int4{-1, -1, -1, 0})); + CHECK(simd::all(ivec<3>{-1, -1, -1})); + CHECK(!simd::all(ivec<3>{0, -1, -1})); + CHECK(!simd::all(ivec<3>{-1, 0, -1})); + CHECK(!simd::all(ivec<3>{-1, -1, 0})); + CHECK(simd::all(int2{-1, -1})); + CHECK(!simd::all(int2{0, -1})); + CHECK(!simd::all(int2{-1, 0})); + CHECK(simd::all(ivec<1>{-1})); + CHECK(!simd::all(ivec<1>{0})); +} + // Verify the simd float types are IEEE 754 compliant for infinity and NaN. TEST_CASE("ieee-compliance", "[simd]") { @@ -164,6 +209,144 @@ int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()})); } +// Check simd::floor. +TEST_CASE("floor", "[simd]") +{ + CHECK(simd::all(simd::floor(float4{-1.9f, 1.9f, 2, -2}) == float4{-2, 1, 2, -2})); + CHECK(simd::all(simd::floor(float2{kInf, -kInf}) == float2{kInf, -kInf})); + CHECK(simd::all(simd::isnan(simd::floor(float2{kNaN, -kNaN})))); +} + +// Check simd::ceil. +TEST_CASE("ceil", "[simd]") +{ + CHECK(simd::all(simd::ceil(float4{-1.9f, 1.9f, 2, -2}) == float4{-1, 2, 2, -2})); + CHECK(simd::all(simd::ceil(float2{kInf, -kInf}) == float2{kInf, -kInf})); + CHECK(simd::all(simd::isnan(simd::ceil(float2{kNaN, -kNaN})))); +} + +// Check simd::sqrt. +TEST_CASE("sqrt", "[simd]") +{ + CHECK(simd::all(simd::sqrt(float4{1, 4, 9, 16}) == float4{1, 2, 3, 4})); + CHECK(simd::all(simd::sqrt(float2{25, 36}) == float2{5, 6})); + CHECK(simd::all(simd::sqrt(vec<1>{36}) == vec<1>{6})); + CHECK(simd::all(simd::sqrt(vec<5>{49, 64, 81, 100, 121}) == vec<5>{7, 8, 9, 10, 11})); + CHECK(simd::all(simd::isnan(simd::sqrt(float4{-1, -kInf, kNaN, -2})))); + CHECK(simd::all(simd::sqrt(vec<3>{kInf, 0, 1}) == vec<3>{kInf, 0, 1})); +} + +static bool check_fast_acos(float x, float fast_acos_x) +{ + float acosf_x = acosf(x); + float error = acosf_x - fast_acos_x; + if (!(fabsf(error) <= SIMD_FAST_ACOS_MAX_ERROR)) + { + auto rad2deg = [](float rad) { return rad * 180 / math::PI; }; + fprintf(stderr, + "Larger-than-expected error from skvx::fast_acos\n" + " x= %f\n" + " fast_acos_x= %f (%f degrees\n" + " acosf_x= %f (%f degrees\n" + " error= %f (%f degrees)\n" + " tolerance= %f (%f degrees)\n\n", + x, + fast_acos_x, + rad2deg(fast_acos_x), + acosf_x, + rad2deg(acosf_x), + error, + rad2deg(error), + SIMD_FAST_ACOS_MAX_ERROR, + rad2deg(SIMD_FAST_ACOS_MAX_ERROR)); + CHECK(false); + return false; + } + return true; +} + +TEST_CASE("fast_acos", "[simd]") +{ + float4 boundaries = simd::fast_acos(float4{-1, 0, 1, 0}); + check_fast_acos(-1, boundaries[0]); + check_fast_acos(0, boundaries[1]); + check_fast_acos(+1, boundaries[2]); + + // Select a distribution of starting points around which to begin testing fast_acos. These + // fall roughly around the known minimum and maximum errors. No need to include -1, 0, or 1 + // since those were just tested above. (Those are tricky because 0 is an inflection and the + // derivative is infinite at 1 and -1.) + using float8 = vec<8>; + float8 x = {-.99f, -.8f, -.4f, -.2f, .2f, .4f, .8f, .99f}; + + // Converge at the various local minima and maxima of "fast_acos(x) - cosf(x)" and verify that + // fast_acos is always within "kTolerance" degrees of the expected answer. + float8 err_; + for (int iter = 0; iter < 10; ++iter) + { + // Run our approximate inverse cosine approximation. + auto fast_acos_x = simd::fast_acos(x); + + // Find d/dx(error) + // = d/dx(fast_acos(x) - acos(x)) + // = (f'g - fg')/gg + 1/sqrt(1 - x^2), [where f = bx^3 + ax, g = dx^4 + cx^2 + 1] + float8 xx = x * x; + float8 a = -0.939115566365855f; + float8 b = 0.9217841528914573f; + float8 c = -1.2845906244690837f; + float8 d = 0.295624144969963174f; + float8 f = (b * xx + a) * x; + float8 f_ = 3 * b * xx + a; + float8 g = (d * xx + c) * xx + 1; + float8 g_ = (4 * d * xx + 2 * c) * x; + float8 gg = g * g; + float8 q = simd::sqrt(1 - xx); + err_ = (f_ * g - f * g_) / gg + 1 / q; + + // Find d^2/dx^2(error) + // = ((f''g - fg'')g^2 - (f'g - fg')2gg') / g^4 + x(1 - x^2)^(-3/2) + // = ((f''g - fg'')g - (f'g - fg')2g') / g^3 + x(1 - x^2)^(-3/2) + float8 f__ = 6 * b * x; + float8 g__ = 12 * d * xx + 2 * c; + float8 err__ = + ((f__ * g - f * g__) * g - (f_ * g - f * g_) * 2 * g_) / (gg * g) + x / ((1 - xx) * q); + +#if 0 + SkDebugf("\n\niter %i\n", iter); +#endif + // Ensure each lane's approximation is within maximum error. + for (int j = 0; j < 8; ++j) + { +#if 0 + SkDebugf("x=%f err=%f err'=%f err''=%f\n", + x[j], rad2deg(skvx::fast_acos_x[j] - acosf(x[j])), + rad2deg(err_[j]), rad2deg(err__[j])); +#endif + if (!check_fast_acos(x[j], fast_acos_x[j])) + { + return; + } + } + + // Use Newton's method to update the x values to locations closer to their local minimum or + // maximum. (This is where d/dx(error) == 0.) + x -= err_ / err__; + x = simd::clamp<float, 8>(x, -.99f, .99f); + } + + // Verify each lane converged to a local minimum or maximum. + for (int j = 0; j < 8; ++j) + { + REQUIRE(math::nearly_zero(err_[j])); + } + + // Make sure we found all the actual known locations of local min/max error. + for (float knownRoot : {-0.983536f, -0.867381f, -0.410923f, 0.410923f, 0.867381f, 0.983536f}) + { + CHECK(simd::any(simd::abs(x - knownRoot) < math::EPSILON)); + } +} + // Check simd::dot. TEST_CASE("dot", "[simd]") {