// blob: dfc38fb2b79ce7d158201d19b1fb40e9ba1fcd83 [file] [log] [blame]
/*
* Copyright 2022 Rive
*/
// An SSE / NEON / WASM_SIMD library based on clang vector types.
//
// This header makes use of the clang vector builtins specified in https://reviews.llvm.org/D111529.
// This effort in clang is still a work in progress, and compiling this header requires an
// extremely recent version of clang.
//
// To explore the codegen from this header, paste it into https://godbolt.org/, select a recent
// clang compiler, and add an -O3 flag.
#ifndef _RIVE_SIMD_HPP_
#define _RIVE_SIMD_HPP_
#include <cassert>
#include <limits>
#include <stdint.h>
#include <type_traits>
// Marks a function "inline" and applies the GNU-style always_inline attribute to it. Used on the
// SIMD helpers in this header; the macro is #undef'd again after the simd namespace is closed.
#define SIMD_ALWAYS_INLINE inline __attribute__((always_inline))
// SIMD math can expect conformant IEEE 754 behavior for NaN and Inf.
// Fail the build when float is not IEC 559 (IEEE 754): the NaN handling documented on min(),
// max(), clamp() and abs() in this header is stated in terms of that behavior.
static_assert(std::numeric_limits<float>::is_iec559,
"Conformant IEEE 754 behavior for NaN and Inf is required.");
namespace rive
{
namespace simd
{
// The GLSL spec uses "gvec" to denote a vector of unspecified type.
//
// gvec<T, N> is a clang extended vector (ext_vector_type) holding N elements of type T, declared
// with an alignment of sizeof(T) * N bytes.
// NOTE(review): sizeof(T) * N is not a power of two when N is not (e.g. N == 3, which dot()
// has a branch for) -- confirm the aligned attribute accepts the widths actually instantiated.
template <typename T, int N>
using gvec = T __attribute__((ext_vector_type(N))) __attribute__((aligned(sizeof(T) * N)));
////// Boolean logic //////
//
// Vector booleans are of type int32_t, where true is ~0 and false is 0. Vector booleans can be
// generated using the builtin comparison operators: ==, !=, <=, >=, <, >
//
// Returns true if at least one element in x is nonzero (i.e., any vector boolean is true).
// Returns false only when every element in x is equal to 0.
template <int N> SIMD_ALWAYS_INLINE bool any(gvec<int32_t, N> x)
{
#if __has_builtin(__builtin_reduce_or)
    // OR-ing every element together leaves a nonzero value iff some element had a bit set.
    return __builtin_reduce_or(x) != 0;
#else
    // Fallback for compilers without __builtin_reduce_or().
    // This particular logic structure gets decent codegen in clang.
    for (int i = 0; i < N; ++i)
    {
        if (x[i])
            return true;
    }
    return false;
#endif
}
// Returns true if all elements in x are equal to ~0.
template <int N> SIMD_ALWAYS_INLINE bool all(gvec<int32_t, N> x)
{
#if __has_builtin(__builtin_reduce_and)
    // AND-ing every element together can only produce ~0 when each element is exactly ~0, so
    // comparing against ~0 (rather than testing for nonzero) keeps the "exactly -1" contract.
    return __builtin_reduce_and(x) == ~0;
#else
    // Fallback for compilers without __builtin_reduce_and().
    // In vector, true is represented by -1 exactly, so we use ~x for "not".
    return !any(~x);
#endif
}
// Per-element NaN test for floating point vectors. Element i of the result is a vector boolean:
// true (~0) where x[i] is NaN, false (0) everywhere else.
template <typename T, int N, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
SIMD_ALWAYS_INLINE gvec<int32_t, N> isnan(gvec<T, N> x)
{
    // NaN is the only value that compares unequal to itself.
    return x != x;
}
// Per-element NaN test for integer vectors. Integers have no NaN representation, so the result
// is always an all-false (zero) vector and the argument is ignored.
template <typename T, int N, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
constexpr gvec<int32_t, N> isnan(gvec<T, N>)
{
    return gvec<int32_t, N>{};
}
////// Math //////
// Per-element minimum of a and b. Behaves like std::min(), except for NaN:
// if exactly one of a[i] and b[i] is NaN, the element that is _not_ NaN is returned.
template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> min(gvec<T, N> a, gvec<T, N> b)
{
#if !__has_builtin(__builtin_elementwise_min)
#pragma message("performance: __builtin_elementwise_min() not supported. Consider updating clang.")
    // Reproduce the NaN behavior of the SIMD builtin by hand: take b when it is smaller, or when
    // a is NaN. (isnan() always yields false for integer types.)
    const auto takeB = (b < a) || isnan(a);
    return takeB ? b : a;
#else
    return __builtin_elementwise_min(a, b);
#endif
}
// Per-element maximum of a and b. Behaves like std::max(), except for NaN:
// if exactly one of a[i] and b[i] is NaN, the element that is _not_ NaN is returned.
template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> max(gvec<T, N> a, gvec<T, N> b)
{
#if !__has_builtin(__builtin_elementwise_max)
#pragma message("performance: __builtin_elementwise_max() not supported. Consider updating clang.")
    // Reproduce the NaN behavior of the SIMD builtin by hand: take b when it is larger, or when
    // a is NaN. (isnan() always yields false for integer types.)
    const auto takeB = (a < b) || isnan(a);
    return takeB ? b : a;
#else
    return __builtin_elementwise_max(a, b);
#endif
}
// Clamps each element of x into [lo, hi]. Unlike std::clamp(), the result is always a value
// between lo and hi:
//
//   * If x is NaN, returns lo (std::clamp() would return NaN).
//   * If hi <= lo, returns hi.
//   * A NaN in lo and/or hi is ignored.
//
template <typename T, int N>
SIMD_ALWAYS_INLINE gvec<T, N> clamp(gvec<T, N> x, gvec<T, N> lo, gvec<T, N> hi)
{
    // Apply the lower bound first, then cap the result with the upper bound.
    const gvec<T, N> raised = max(lo, x);
    return min(raised, hi);
}
// Returns the absolute value of x per element, with one exception:
// If x[i] is an integer type and equal to the minimum representable value, returns x[i].
template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> abs(gvec<T, N> x)
{
#if __has_builtin(__builtin_elementwise_abs)
    return __builtin_elementwise_abs(x);
#else
#pragma message("performance: __builtin_elementwise_abs() not supported. Consider updating clang.")
    // Put the arithmetic on the "true" side so a NaN element (which fails the comparison) is
    // passed through untouched.
    //
    // Compare with "<=" and subtract from zero, instead of "x < 0 ? -x", so that -0.0 yields +0.0
    // just like the builtin path does. For every other input the result is the same as "-x".
    return x <= 0 ? static_cast<T>(0) - x : x;
#endif
}
////// Loading and storing //////
// Builds a gvec<T, N> from the sizeof(T) * N bytes starting at ptr. The bytes are copied with
// memcpy, so the read never goes through a vector-typed pointer.
template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
{
    constexpr auto kByteCount = sizeof(T) * N;
    gvec<T, N> loaded;
    __builtin_memcpy(&loaded, ptr, kByteCount);
    return loaded;
}
// Typed shorthands for load<T, N>() covering the 2- and 4-wide float, int32_t and uint32_t
// vectors.
SIMD_ALWAYS_INLINE gvec<float, 2> load2f(const void* ptr)
{
    return load<float, 2>(ptr);
}
SIMD_ALWAYS_INLINE gvec<float, 4> load4f(const void* ptr)
{
    return load<float, 4>(ptr);
}
SIMD_ALWAYS_INLINE gvec<int32_t, 2> load2i(const void* ptr)
{
    return load<int32_t, 2>(ptr);
}
SIMD_ALWAYS_INLINE gvec<int32_t, 4> load4i(const void* ptr)
{
    return load<int32_t, 4>(ptr);
}
SIMD_ALWAYS_INLINE gvec<uint32_t, 2> load2ui(const void* ptr)
{
    return load<uint32_t, 2>(ptr);
}
SIMD_ALWAYS_INLINE gvec<uint32_t, 4> load4ui(const void* ptr)
{
    return load<uint32_t, 4>(ptr);
}
// Writes the N elements of vec (sizeof(T) * N bytes) to the memory starting at ptr, using memcpy.
template <typename T, int N> SIMD_ALWAYS_INLINE void store(void* ptr, gvec<T, N> vec)
{
    constexpr auto kByteCount = sizeof(T) * N;
    __builtin_memcpy(ptr, &vec, kByteCount);
}
// Concatenates two vectors: the result holds the M elements of a followed by the N elements of b.
template <typename T, int M, int N>
SIMD_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b)
{
    // Stage both inputs back-to-back in a scalar array, then reload it as one wide vector.
    T lanes[M + N];
    __builtin_memcpy(&lanes[0], &a, sizeof(T) * M);
    __builtin_memcpy(&lanes[M], &b, sizeof(T) * N);
    return load<T, M + N>(lanes);
}
////// Basic linear algebra //////
// Dot product: multiplies a and b per element and sums the products from element 0 upward.
// The 2-, 3- and 4-wide cases are written out explicitly; other widths use a loop.
template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
{
    const gvec<T, N> products = a * b;
    if constexpr (N == 2)
    {
        return products[0] + products[1];
    }
    else if constexpr (N == 3)
    {
        return products[0] + products[1] + products[2];
    }
    else if constexpr (N == 4)
    {
        return products[0] + products[1] + products[2] + products[3];
    }
    else
    {
        T sum = products[0];
        for (int lane = 1; lane < N; ++lane)
        {
            sum += products[lane];
        }
        return sum;
    }
}
// 2D cross product: a.x * b.y - a.y * b.x.
SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
{
    // Swizzle b so both products come out of a single vector multiply.
    const gvec<float, 2> products = a * b.yx;
    return products[0] - products[1];
}
// Linear interpolation from a (at t == 0) toward b, with a scalar t applied to every element.
// t must be in [0, 1); this is checked with assert().
//
// NOTE: mix(a, b, 1) !== b (!!)
//
// The "(b - a) * t + a" form is not numerically exact at t === 1. It is kept anyway because it
// seems to give better precision for things like chopping cubics on exact cusp points than
// "a*(1 - t) + b*t" (which would return exactly b when t == 1).
template <int N> SIMD_ALWAYS_INLINE gvec<float, N> mix(gvec<float, N> a, gvec<float, N> b, float t)
{
    assert(0 <= t && t < 1);
    const gvec<float, N> delta = b - a;
    return delta * t + a;
}
// Linear interpolation from a toward b with a separate t per element, computed as
// "(b - a) * t + a". Every element of t must be in [0, 1); this is checked with assert().
template <int N>
SIMD_ALWAYS_INLINE gvec<float, N> mix(gvec<float, N> a, gvec<float, N> b, gvec<float, N> t)
{
    assert(simd::all(0 <= t && t < 1));
    const gvec<float, N> delta = b - a;
    return delta * t + a;
}
} // namespace simd
} // namespace rive
// SIMD_ALWAYS_INLINE is only used by the definitions above; remove it here so the macro is not
// defined for whatever follows this point.
#undef SIMD_ALWAYS_INLINE
// Short names in namespace rive for the common simd::gvec instantiations.
namespace rive
{
// float vectors.
template <int N> using vec = simd::gvec<float, N>;
using float2 = simd::gvec<float, 2>;
using float4 = simd::gvec<float, 4>;

// int32_t vectors.
template <int N> using ivec = simd::gvec<int32_t, N>;
using int2 = simd::gvec<int32_t, 2>;
using int4 = simd::gvec<int32_t, 4>;

// uint32_t vectors.
template <int N> using uvec = simd::gvec<uint32_t, N>;
using uint2 = simd::gvec<uint32_t, 2>;
using uint4 = simd::gvec<uint32_t, 4>;
} // namespace rive
#endif