// include/private/SkVx.h - skia - Git at Google

 /*
  * Copyright 2019 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #ifndef SKVX_DEFINED
 #define SKVX_DEFINED

 // skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.
 //
 // This time we're leaning a bit less on platform-specific intrinsics and a bit
 // more on Clang/GCC vector extensions, but still keeping the option open to
 // drop in platform-specific intrinsics, actually more easily than before.
 //
 // We've also fixed a few of the caveats that used to make SkNx awkward to work
 // with across translation units.  skvx::Vec<N,T> always has N*sizeof(T) size
 // and alignof(T) alignment and is safe to use across translation units freely.


 // It'd be nice to not pull in any Skia headers here, in case we want to spin this file off.
 #include <algorithm>         // std::accumulate, std::copy, std::fill, std::transform, etc.
 #include <cstdint>           // intXX_t
 #include <cstring>           // memcpy()
 #include <cmath>             // std::ceil, std::floor, std::trunc, std::round, std::sqrt, etc.
 #include <functional>        // std::plus, std::minus, std::multiplies, etc.
 #include <initializer_list>  // std::initializer_list

 // We try to use <algorithm> and <functional> where natural so that the more
 // idiosyncratic parts that can't use them stand out.  This is an experiment.

 namespace skvx {

 // All Vec have the same simple memory layout, the same as `T vec[N]`.
 // This gives Vec a consistent ABI, letting them pass between files compiled with
 // different instruction sets (e.g. SSE2 and AVX2) without fear of ODR violation.
 template <int N, typename T>
 struct Vec {
     // N > 0 is spelled out because 0 also satisfies (N & (N-1)) == 0, and a
     // zero-length vals[] is accepted as an extension by some compilers.
     static_assert(N > 0 && (N & (N-1)) == 0, "N must be a power of 2.");

     // The N lanes, stored contiguously in index order.
     T vals[N];

     // Methods belong here in the class declaration of Vec only if:
     //   - they must be here, like constructors or operator[];
     //   - they'll definitely never want a specialized implementation.
     // Other operations on Vec should be defined outside the type.

     // Defaulted: performs no explicit initialization of the lanes.
     Vec() = default;

     // Splat: every lane is set to x.  Not explicit, so a T converts to a Vec.
     Vec(T x) { std::fill(vals,vals+N, x); }

     // Copies up to N values from xs into the leading lanes.  Lanes past
     // xs.size() are zero; values past the first N are ignored.
     Vec(std::initializer_list<T> xs) : Vec(T(0)) {
         const std::size_t count = std::min(xs.size(), static_cast<std::size_t>(N));
         std::copy(xs.begin(), xs.begin() + count, vals);
     }

     // Lane access by index; i is not range-checked.
     T  operator[](int i) const { return vals[i]; }
     T& operator[](int i)       { return vals[i]; }

     // Reads sizeof(Vec) bytes from ptr into a new Vec.  memcpy is used so ptr
     // need not be aligned for T.
     static Vec Load(const void* ptr) {
         Vec v;
         std::memcpy(&v, ptr, sizeof(Vec));
         return v;
     }

     // Writes sizeof(Vec) bytes of lane data to ptr.
     void store(void* ptr) const {
         std::memcpy(ptr, this, sizeof(Vec));
     }
 };


 // ALWAYS_INLINE is the compiler-specific spelling of a forced-inline request:
 // __forceinline under MSVC, the always_inline attribute everywhere else.
 // Both ALWAYS_INLINE and ___ are #undef'd again at the bottom of this header.
 #if defined(_MSC_VER)
     #define ALWAYS_INLINE __forceinline
 #else
     #define ALWAYS_INLINE __attribute__((always_inline))
 #endif

 // Helps tamp down on the repetitive boilerplate.
 // ___ is the shared prefix of the free functions below: a template over the
 // lane count N and lane type T, declared static inline ALWAYS_INLINE.
 #define ___ template <int N, typename T> static inline ALWAYS_INLINE

 #if defined(__GNUC__) && !defined(__clang__) && defined(__SSE__)
     // GCC warns about ABI changes when returning >= 32 byte vectors when -mavx is not enabled.
     // The functions that do that (BitPun::operator U() and to_vext()) are marked ALWAYS_INLINE,
     // so we can just stifle the warning.
     // NOTE(review): there is no diagnostic push/pop around this pragma in this file, so the
     // suppression presumably stays in effect for whatever includes this header - confirm.
     #pragma GCC diagnostic ignored "-Wpsabi"
 #endif

 // BitPun<V> wraps a V and converts implicitly to any type U of the same size,
 // copying the bytes of the held value into a fresh U.
 template <typename V>
 struct BitPun {
     V v;

     // U is deduced from the context the BitPun is used in, e.g. the declared
     // return type of the function that returns it.
     template <typename U>
     ALWAYS_INLINE operator U() const {
         static_assert(sizeof(U) == sizeof(V), "");
         U punned;
         memcpy(&punned, &v, sizeof(punned));
         return punned;
     }
 };

 // Wraps v so that it can be bit-punned to whatever type the use site expects.
 template <typename V>
 static inline ALWAYS_INLINE BitPun<V> bit_pun(V v) {
     return BitPun<V>{v};
 }

 // Mask<Lane> is the lane type of a comparison result for lanes of type Lane:
 // float maps to int32_t, double maps to int64_t, every other type maps to itself.
 template <typename Lane> struct MaskHelper         { using type = Lane;    };
 template <>              struct MaskHelper<float>  { using type = int32_t; };
 template <>              struct MaskHelper<double> { using type = int64_t; };

 template <typename Lane>
 using Mask = typename MaskHelper<Lane>::type;


 // map(x, op) calls op on every lane of x, in lane order, and collects the
 // results into a Vec whose lane type is whatever op returns.
 template <int N, typename T, typename Op>
 static inline auto map(Vec<N,T> x, Op op) -> Vec<N, decltype(op(x[0]))> {
     using R = decltype(op(x[0]));
     Vec<N,R> out;
     std::transform(x.vals, x.vals+N, out.vals, op);
     return out;
 }

 // map(x, y, op) does the same for a binary op applied to matching lanes of x and y.
 template <int N, typename T, typename Op>
 static inline auto map(Vec<N,T> x, Vec<N,T> y, Op op) -> Vec<N, decltype(op(x[0], y[0]))> {
     using R = decltype(op(x[0], y[0]));
     Vec<N,R> out;
     std::transform(x.vals, x.vals+N, y.vals, out.vals, op);
     return out;
 }


 // We have two default strategies for implementing most operations:
 //    1) lean on Clang/GCC vector extensions when available;
 //    2) fall back to portable implementations when not.
 // At the end we can drop in platform-specific implementations that override these defaults.

 #if !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))

     // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
     // N.B. VExt<N,T> alignment is N*alignof(T), stricter than Vec<N,T>'s alignof(T).

     #if defined(__clang__)
         // Clang: ext_vector_type takes the lane count.
         template <int N, typename T>
         using VExt = T __attribute__((ext_vector_type(N)));

     #elif defined(__GNUC__)
         // GCC: vector_size takes a byte count, hence N*sizeof(T).  The typedef lives
         // in a helper struct and VExt aliases its nested type.
         template <int N, typename T>
         struct VExtHelper {
             typedef T __attribute__((vector_size(N*sizeof(T)))) type;
         };

         template <int N, typename T>
         using VExt = typename VExtHelper<N,T>::type;
     #endif

     // Convert a Vec to the equally sized native vector type by bit-punning it.
     ___ VExt<N,T> to_vext(Vec<N,T> v) { return bit_pun(v); }

     // Every operator below follows one pattern: convert the operands with to_vext(),
     // apply the compiler's built-in vector operator, and bit_pun the result back
     // into the declared Vec return type.

     // Binary arithmetic.
     ___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) + to_vext(y)); }
     ___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) - to_vext(y)); }
     ___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) * to_vext(y)); }
     ___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) / to_vext(y)); }

     // Binary bitwise.
     ___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) ^ to_vext(y)); }
     ___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) & to_vext(y)); }
     ___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) | to_vext(y)); }

     // Unary.
     ___ Vec<N,T> operator!(Vec<N,T> x) { return bit_pun(!to_vext(x)); }
     ___ Vec<N,T> operator-(Vec<N,T> x) { return bit_pun(-to_vext(x)); }
     ___ Vec<N,T> operator~(Vec<N,T> x) { return bit_pun(~to_vext(x)); }

     // Shifts: all lanes are shifted by the same scalar count.
     ___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) << bits); }
     ___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) >> bits); }

     // Comparisons return a vector of Mask<T> lanes rather than T lanes.
     ___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) == to_vext(y)); }
     ___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) != to_vext(y)); }
     ___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) <= to_vext(y)); }
     ___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) >= to_vext(y)); }
     ___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) <  to_vext(y)); }
     ___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) >  to_vext(y)); }

 #else

     // Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
     // We'll implement things portably, in a way that should be easily autovectorizable.

     // Binary arithmetic: map() applies the matching <functional> object to each pair of lanes.
     ___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::plus      <T>{}); }
     ___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::minus     <T>{}); }
     ___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::multiplies<T>{}); }
     ___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::divides   <T>{}); }

     // Binary bitwise, same approach.
     ___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_xor<T>{}); }
     ___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_and<T>{}); }
     ___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_or <T>{}); }

     // Unary operators.  std::logical_not<T> returns bool, which would make map()
     // build a Vec<N,bool> that cannot convert to the declared Vec<N,T>; the lambda
     // pins the lane type to T instead: T(1) where the lane is zero, T(0) elsewhere.
     // NOTE(review): confirm this true-value matches what the vector-extension path yields.
     ___ Vec<N,T> operator!(Vec<N,T> x) { return map(x, [](T a) -> T { return !a; }); }
     ___ Vec<N,T> operator-(Vec<N,T> x) { return map(x, std::negate     <T>{}); }
     ___ Vec<N,T> operator~(Vec<N,T> x) { return map(x, std::bit_not    <T>{}); }

     // Shift every lane by the same count.  The explicit -> T matters for lane types
     // narrower than int: integer promotion makes `a << bits` an int, and without the
     // conversion map() would return Vec<N,int> rather than Vec<N,T>.
     ___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return map(x, [bits](T a) -> T { return a << bits; }); }
     ___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return map(x, [bits](T a) -> T { return a >> bits; }); }

     // Comparisons: each result lane is a Mask<T> holding ~0 where the comparison
     // holds for that pair of lanes and 0 where it does not.
     ___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a == b ? ~0 : 0; }); }
     ___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a != b ? ~0 : 0; }); }
     ___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a <= b ? ~0 : 0; }); }
     ___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a >= b ? ~0 : 0; }); }
     ___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a <  b ? ~0 : 0; }); }
     ___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a >  b ? ~0 : 0; }); }
 #endif

 // Some operations we want are not expressible with Clang/GCC vector extensions,
 // so we implement them using the same approach as the alternate path above.

 // Bitwise select: each result bit comes from t where the matching bit of cond
 // is set and from e where it is clear.  t and e are bit-punned to mask lanes so
 // the blend is done entirely with &, | and ~.
 ___ Vec<N,T> if_then_else(Vec<N,Mask<T>> cond, Vec<N,T> t, Vec<N,T> e) {
     using M = Vec<N,Mask<T>>;
     const M t_bits = bit_pun(t);
     const M e_bits = bit_pun(e);
     const M from_t =  cond & t_bits;
     const M from_e = ~cond & e_bits;
     return bit_pun(from_t | from_e);
 }

 // Pointer-style iteration over the lanes: begin() is the first lane and end()
 // is one past the last, in const and mutable flavors.
 ___ const T* begin(const Vec<N,T>& x) {
     return x.vals;
 }
 ___ T* begin(Vec<N,T>& x) {
     return x.vals;
 }
 ___ const T* end(const Vec<N,T>& x) {
     const T* const first = x.vals;
     return first + N;
 }
 ___ T* end(Vec<N,T>& x) {
     T* const first = x.vals;
     return first + N;
 }

 // Lane-wise minimum of x and y, using std::min on each pair of lanes.
 ___ Vec<N,T> min(Vec<N,T> x, Vec<N,T> y) {
     const auto smaller = [](T a, T b) -> T { return std::min(a,b); };
     return map(x, y, smaller);
 }

 // Lane-wise maximum of x and y, using std::max on each pair of lanes.
 ___ Vec<N,T> max(Vec<N,T> x, Vec<N,T> y) {
     const auto larger = [](T a, T b) -> T { return std::max(a,b); };
     return map(x, y, larger);
 }

 // Scalar/vector operations just splat the scalar to a vector...
 // Each overload builds Vec<N,T>(scalar), which fills every lane with the scalar,
 // and then forwards to the vector/vector form of the same operation.
 // T is deduced from both arguments, so the scalar must already have the
 // vector's lane type.
 ___ Vec<N,T>       operator+ (T x, Vec<N,T> y) { return Vec<N,T>(x) +  y; }
 ___ Vec<N,T>       operator- (T x, Vec<N,T> y) { return Vec<N,T>(x) -  y; }
 ___ Vec<N,T>       operator* (T x, Vec<N,T> y) { return Vec<N,T>(x) *  y; }
 ___ Vec<N,T>       operator/ (T x, Vec<N,T> y) { return Vec<N,T>(x) /  y; }
 ___ Vec<N,T>       operator^ (T x, Vec<N,T> y) { return Vec<N,T>(x) ^  y; }
 ___ Vec<N,T>       operator& (T x, Vec<N,T> y) { return Vec<N,T>(x) &  y; }
 ___ Vec<N,T>       operator| (T x, Vec<N,T> y) { return Vec<N,T>(x) |  y; }
 ___ Vec<N,Mask<T>> operator==(T x, Vec<N,T> y) { return Vec<N,T>(x) == y; }
 ___ Vec<N,Mask<T>> operator!=(T x, Vec<N,T> y) { return Vec<N,T>(x) != y; }
 ___ Vec<N,Mask<T>> operator<=(T x, Vec<N,T> y) { return Vec<N,T>(x) <= y; }
 ___ Vec<N,Mask<T>> operator>=(T x, Vec<N,T> y) { return Vec<N,T>(x) >= y; }
 ___ Vec<N,Mask<T>> operator< (T x, Vec<N,T> y) { return Vec<N,T>(x) <  y; }
 ___ Vec<N,Mask<T>> operator> (T x, Vec<N,T> y) { return Vec<N,T>(x) >  y; }
 ___ Vec<N,T>              min(T x, Vec<N,T> y) { return min(Vec<N,T>(x), y); }
 ___ Vec<N,T>              max(T x, Vec<N,T> y) { return max(Vec<N,T>(x), y); }
 // ... and same deal for vector/scalar operations.
 ___ Vec<N,T>       operator+ (Vec<N,T> x, T y) { return x +  Vec<N,T>(y); }
 ___ Vec<N,T>       operator- (Vec<N,T> x, T y) { return x -  Vec<N,T>(y); }
 ___ Vec<N,T>       operator* (Vec<N,T> x, T y) { return x *  Vec<N,T>(y); }
 ___ Vec<N,T>       operator/ (Vec<N,T> x, T y) { return x /  Vec<N,T>(y); }
 ___ Vec<N,T>       operator^ (Vec<N,T> x, T y) { return x ^  Vec<N,T>(y); }
 ___ Vec<N,T>       operator& (Vec<N,T> x, T y) { return x &  Vec<N,T>(y); }
 ___ Vec<N,T>       operator| (Vec<N,T> x, T y) { return x |  Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator==(Vec<N,T> x, T y) { return x == Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator!=(Vec<N,T> x, T y) { return x != Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator<=(Vec<N,T> x, T y) { return x <= Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator>=(Vec<N,T> x, T y) { return x >= Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator< (Vec<N,T> x, T y) { return x <  Vec<N,T>(y); }
 ___ Vec<N,Mask<T>> operator> (Vec<N,T> x, T y) { return x >  Vec<N,T>(y); }
 ___ Vec<N,T>              min(Vec<N,T> x, T y) { return min(x, Vec<N,T>(y)); }
 ___ Vec<N,T>              max(Vec<N,T> x, T y) { return max(x, Vec<N,T>(y)); }

 // The various op= operators, for vectors...
 // Each one computes the corresponding binary operation, assigns the result
 // back into x, and returns a reference to x.
 ___ Vec<N,T>& operator+=(Vec<N,T>& x, Vec<N,T> y) { return (x = x + y); }
 ___ Vec<N,T>& operator-=(Vec<N,T>& x, Vec<N,T> y) { return (x = x - y); }
 ___ Vec<N,T>& operator*=(Vec<N,T>& x, Vec<N,T> y) { return (x = x * y); }
 ___ Vec<N,T>& operator/=(Vec<N,T>& x, Vec<N,T> y) { return (x = x / y); }
 ___ Vec<N,T>& operator^=(Vec<N,T>& x, Vec<N,T> y) { return (x = x ^ y); }
 ___ Vec<N,T>& operator&=(Vec<N,T>& x, Vec<N,T> y) { return (x = x & y); }
 ___ Vec<N,T>& operator|=(Vec<N,T>& x, Vec<N,T> y) { return (x = x | y); }
 // ... for scalars...
 // The scalar is splatted to a Vec explicitly before the operation.
 ___ Vec<N,T>& operator+=(Vec<N,T>& x, T y) { return (x = x + Vec<N,T>(y)); }
 ___ Vec<N,T>& operator-=(Vec<N,T>& x, T y) { return (x = x - Vec<N,T>(y)); }
 ___ Vec<N,T>& operator*=(Vec<N,T>& x, T y) { return (x = x * Vec<N,T>(y)); }
 ___ Vec<N,T>& operator/=(Vec<N,T>& x, T y) { return (x = x / Vec<N,T>(y)); }
 ___ Vec<N,T>& operator^=(Vec<N,T>& x, T y) { return (x = x ^ Vec<N,T>(y)); }
 ___ Vec<N,T>& operator&=(Vec<N,T>& x, T y) { return (x = x & Vec<N,T>(y)); }
 ___ Vec<N,T>& operator|=(Vec<N,T>& x, T y) { return (x = x | Vec<N,T>(y)); }
 // ... and for shifts.
 ___ Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
 ___ Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }

 // Lane-wise <cmath> wrappers: each applies the std:: function of the same name
 // to every lane through map().

 ___ Vec<N,T> ceil(Vec<N,T> x) {
     const auto lane_ceil = [](T a) { return std::ceil(a); };
     return map(x, lane_ceil);
 }
 ___ Vec<N,T> floor(Vec<N,T> x) {
     const auto lane_floor = [](T a) { return std::floor(a); };
     return map(x, lane_floor);
 }
 ___ Vec<N,T> trunc(Vec<N,T> x) {
     const auto lane_trunc = [](T a) { return std::trunc(a); };
     return map(x, lane_trunc);
 }
 ___ Vec<N,T> round(Vec<N,T> x) {
     const auto lane_round = [](T a) { return std::round(a); };
     return map(x, lane_round);
 }
 ___ Vec<N,T> sqrt(Vec<N,T> x) {
     const auto lane_sqrt = [](T a) { return std::sqrt(a); };
     return map(x, lane_sqrt);
 }

 // abs: select -x in the lanes where x < 0 and x everywhere else.
 ___ Vec<N,T> abs(Vec<N,T> x) {
     const Vec<N,Mask<T>> is_negative = x < T(0);
     return if_then_else(is_negative, -x, x);
 }

 // rcp: lane-wise reciprocal, computed as a vector of ones divided by x.
 ___ Vec<N,T> rcp(Vec<N,T> x) {
     const Vec<N,T> ones = Vec<N,T>(T(1));
     return ones / x;
 }

 // rsqrt: lane-wise reciprocal of the square root.
 ___ Vec<N,T> rsqrt(Vec<N,T> x) {
     const Vec<N,T> root = sqrt(x);
     return rcp(root);
 }


 // Horizontal minimum: the smallest lane of x, found with std::min_element.
 ___ T min(Vec<N,T> x) {
     const T* const smallest = std::min_element(x.vals, x.vals+N);
     return *smallest;
 }

 // Horizontal maximum: the largest lane of x, found with std::max_element.
 ___ T max(Vec<N,T> x) {
     const T* const largest = std::max_element(x.vals, x.vals+N);
     return *largest;
 }

 // any: true when at least one lane compares unequal to Mask<T>(0).
 ___ bool any(Vec<N,T> x) {
     const auto lane_is_set = [](T a) { return a != Mask<T>(0); };
     return std::any_of(x.vals, x.vals+N, lane_is_set);
 }

 // all: true when every lane compares unequal to Mask<T>(0).
 ___ bool all(Vec<N,T> x) {
     const auto lane_is_set = [](T a) { return a != Mask<T>(0); };
     return std::all_of(x.vals, x.vals+N, lane_is_set);
 }

 // Platform-specific specializations and overloads can now drop in here.

 }  // namespace skvx

 // Since cast() takes an extra template argument D (the type to cast to),
 // argument-dependent lookup won't let us just type cast<D>(...), instead
 // skvx::cast<D>(...).  That's annoying given how nice all the other methods
 // are, so we'll just move this guy outside into the global namespace.
 // That's pretty harmless... it still only works on skvx::Vec types.
 // cast<D>(src) converts every lane of src from S to D.  Clang builds without
 // SKNX_NO_SIMD convert the whole vector with __builtin_convertvector; all other
 // builds convert lane by lane through skvx::map().
 template <typename D, int N, typename S>
 static inline ALWAYS_INLINE skvx::Vec<N,D> cast(skvx::Vec<N,S> src) {
 #if defined(__clang__) && !defined(SKNX_NO_SIMD)
     const auto converted = __builtin_convertvector(skvx::to_vext(src), skvx::VExt<N,D>);
     return skvx::bit_pun(converted);
 #else
     const auto convert_lane = [](S a) { return (D)a; };
     return skvx::map(src, convert_lane);
 #endif
 }


 #undef ALWAYS_INLINE
 #undef ___

 #endif//SKVX_DEFINED
	/*
	* Copyright 2019 Google Inc.
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#ifndef SKVX_DEFINED
	#define SKVX_DEFINED

	// skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.
	//
	// This time we're leaning a bit less on platform-specific intrinsics and a bit
	// more on Clang/GCC vector extensions, but still keeping the option open to
	// drop in platform-specific intrinsics, actually more easily than before.
	//
	// We've also fixed a few of the caveats that used to make SkNx awkward to work
	// with across translation units. skvx::Vec<N,T> always has N*sizeof(T) size
	// and alignof(T) alignment and is safe to use across translation units freely.


	// It'd be nice to not pull in any Skia headers here, in case we want to spin this file off.
	#include <algorithm> // std::accumulate, std::copy, std::fill, std::transform, etc.
	#include <cstdint> // intXX_t
	#include <cstring> // memcpy()
	#include <cmath> // std::ceil, std::floor, std::trunc, std::round, std::sqrt, etc.
	#include <functional> // std::plus, std::minus, std::multiplies, etc.
	#include <initializer_list> // std::initializer_list

	// We try to use <algorithm> and <functional> where natural so that the more
	// idiosyncratic parts that can't use them stand out. This is an experiment.

	namespace skvx {

	// All Vec have the same simple memory layout, the same as `T vec[N]`.
	// This gives Vec a consistent ABI, letting them pass between files compiled with
	// different instruction sets (e.g. SSE2 and AVX2) without fear of ODR violation.
	template <int N, typename T>
	struct Vec {
	    // N > 0 is spelled out because 0 also satisfies (N & (N-1)) == 0, and a
	    // zero-length vals[] is accepted as an extension by some compilers.
	    static_assert(N > 0 && (N & (N-1)) == 0, "N must be a power of 2.");

	    // The N lanes, stored contiguously in index order.
	    T vals[N];

	    // Methods belong here in the class declaration of Vec only if:
	    //   - they must be here, like constructors or operator[];
	    //   - they'll definitely never want a specialized implementation.
	    // Other operations on Vec should be defined outside the type.

	    // Defaulted: performs no explicit initialization of the lanes.
	    Vec() = default;

	    // Splat: every lane is set to x.  Not explicit, so a T converts to a Vec.
	    Vec(T x) { std::fill(vals,vals+N, x); }

	    // Copies up to N values from xs into the leading lanes.  Lanes past
	    // xs.size() are zero; values past the first N are ignored.
	    Vec(std::initializer_list<T> xs) : Vec(T(0)) {
	        const std::size_t count = std::min(xs.size(), static_cast<std::size_t>(N));
	        std::copy(xs.begin(), xs.begin() + count, vals);
	    }

	    // Lane access by index; i is not range-checked.
	    T  operator[](int i) const { return vals[i]; }
	    T& operator[](int i)       { return vals[i]; }

	    // Reads sizeof(Vec) bytes from ptr into a new Vec.  memcpy is used so ptr
	    // need not be aligned for T.
	    static Vec Load(const void* ptr) {
	        Vec v;
	        std::memcpy(&v, ptr, sizeof(Vec));
	        return v;
	    }

	    // Writes sizeof(Vec) bytes of lane data to ptr.
	    void store(void* ptr) const {
	        std::memcpy(ptr, this, sizeof(Vec));
	    }
	};


	#if defined(_MSC_VER)
	#define ALWAYS_INLINE __forceinline
	#else
	#define ALWAYS_INLINE __attribute__((always_inline))
	#endif

	// Helps tamp down on the repetitive boilerplate.
	#define ___ template <int N, typename T> static inline ALWAYS_INLINE

	#if defined(__GNUC__) && !defined(__clang__) && defined(__SSE__)
	// GCC warns about ABI changes when returning >= 32 byte vectors when -mavx is not enabled.
	// The functions that do that (BitPun::operator U() and to_vext()) are marked ALWAYS_INLINE,
	// so we can just stifle the warning.
	#pragma GCC diagnostic ignored "-Wpsabi"
	#endif

	// BitPun<V> holds a V and can implicitly bit-pun that V to any other equal sized type U.
	template <typename V>
	struct BitPun {
	V v;

	template <typename U>
	ALWAYS_INLINE operator U() const {
	static_assert(sizeof(U) == sizeof(V), "");
	U u;
	memcpy(&u, &v, sizeof(U));
	return u;
	}
	};
	template <typename V>
	static inline ALWAYS_INLINE BitPun<V> bit_pun(V v) { return {v}; }

	// Translate from a value type T to its corresponding Mask, the result of a comparison.
	template <typename T> struct MaskHelper { using type = T; };
	template <> struct MaskHelper<float > { using type = int32_t; };
	template <> struct MaskHelper<double> { using type = int64_t; };
	template <typename T> using Mask = typename MaskHelper<T>::type;


	// Apply op() to each lane of one or two input vectors, returning a new vector of the results.
	template <int N, typename T, typename Op>
	static inline auto map(Vec<N,T> x, Op op) -> Vec<N, decltype(op(x[0]))> {
	Vec<N, decltype(op(x[0]))> results;
	std::transform(x.vals, x.vals+N, results.vals, op);
	return results;
	}
	template <int N, typename T, typename Op>
	static inline auto map(Vec<N,T> x, Vec<N,T> y, Op op) -> Vec<N, decltype(op(x[0], y[0]))> {
	Vec<N, decltype(op(x[0], y[0]))> results;
	std::transform(x.vals, x.vals+N, y.vals, results.vals, op);
	return results;
	}


	// We have two default strategies for implementing most operations:
	// 1) lean on Clang/GCC vector extensions when available;
	// 2) fall back to portable implementations when not.
	// At the end we can drop in platform-specific implementations that override these defaults.

	// Take the vector-extension path on Clang/GCC unless SKNX_NO_SIMD is defined.
	#if !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))

	// VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
	// N.B. VExt<N,T> alignment is N*alignof(T), stricter than Vec<N,T>'s alignof(T).

	#if defined(__clang__)
	template <int N, typename T>
	using VExt = T __attribute__((ext_vector_type(N)));

	#elif defined(__GNUC__)
	template <int N, typename T>
	struct VExtHelper {
	typedef T __attribute__((vector_size(N*sizeof(T)))) type;
	};

	template <int N, typename T>
	using VExt = typename VExtHelper<N,T>::type;
	#endif

	___ VExt<N,T> to_vext(Vec<N,T> v) { return bit_pun(v); }

	___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) + to_vext(y)); }
	___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) - to_vext(y)); }
	// Lane-wise product, computed on the compiler's native vector type.
	___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) * to_vext(y)); }
	___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) / to_vext(y)); }

	___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) ^ to_vext(y)); }
	___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) & to_vext(y)); }
	// Lane-wise bitwise OR, computed on the compiler's native vector type.
	___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) | to_vext(y)); }

	___ Vec<N,T> operator!(Vec<N,T> x) { return bit_pun(!to_vext(x)); }
	___ Vec<N,T> operator-(Vec<N,T> x) { return bit_pun(-to_vext(x)); }
	___ Vec<N,T> operator~(Vec<N,T> x) { return bit_pun(~to_vext(x)); }

	___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) << bits); }
	___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) >> bits); }

	___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) == to_vext(y)); }
	___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) != to_vext(y)); }
	___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) <= to_vext(y)); }
	___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) >= to_vext(y)); }
	___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) < to_vext(y)); }
	___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) > to_vext(y)); }

	#else

	// Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
	// We'll implement things portably, in a way that should be easily autovectorizable.

	___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::plus <T>{}); }
	___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::minus <T>{}); }
	___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::multiplies<T>{}); }
	___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::divides <T>{}); }

	___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_xor<T>{}); }
	___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_and<T>{}); }
	// Lane-wise bitwise OR via std::bit_or applied to each pair of lanes.
	___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return map(x,y, std::bit_or <T>{}); }

	// Unary operators.  std::logical_not<T> returns bool, which would make map()
	// build a Vec<N,bool> that cannot convert to the declared Vec<N,T>; the lambda
	// pins the lane type to T instead: T(1) where the lane is zero, T(0) elsewhere.
	// NOTE(review): confirm this true-value matches what the vector-extension path yields.
	___ Vec<N,T> operator!(Vec<N,T> x) { return map(x, [](T a) -> T { return !a; }); }
	___ Vec<N,T> operator-(Vec<N,T> x) { return map(x, std::negate <T>{}); }
	___ Vec<N,T> operator~(Vec<N,T> x) { return map(x, std::bit_not <T>{}); }

	// Shift every lane by the same count.  The explicit -> T matters for lane types
	// narrower than int: integer promotion makes `a << bits` an int, and without the
	// conversion map() would return Vec<N,int> rather than Vec<N,T>.
	___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return map(x, [bits](T a) -> T { return a << bits; }); }
	___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return map(x, [bits](T a) -> T { return a >> bits; }); }

	___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a == b ? ~0 : 0; }); }
	___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a != b ? ~0 : 0; }); }
	___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a <= b ? ~0 : 0; }); }
	___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a >= b ? ~0 : 0; }); }
	___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a < b ? ~0 : 0; }); }
	___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a > b ? ~0 : 0; }); }
	#endif

	// Some operations we want are not expressible with Clang/GCC vector extensions,
	// so we implement them using the same approach as the alternate path above.

	// Bitwise select: each result bit comes from t where the matching bit of cond
	// is set and from e where it is clear.  t and e are bit-punned to mask lanes so
	// the blend is done entirely with &, | and ~.
	___ Vec<N,T> if_then_else(Vec<N,Mask<T>> cond, Vec<N,T> t, Vec<N,T> e) {
	    Vec<N,Mask<T>> t_bits = bit_pun(t),
	                   e_bits = bit_pun(e);
	    return bit_pun( (cond & t_bits) | (~cond & e_bits) );
	}

	___ const T* begin(const Vec<N,T>& x) { return x.vals ; }
	___ T* begin( Vec<N,T>& x) { return x.vals ; }
	___ const T* end(const Vec<N,T>& x) { return x.vals+N; }
	___ T* end( Vec<N,T>& x) { return x.vals+N; }

	___ Vec<N,T> min(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) { return std::min(a,b); }); }
	___ Vec<N,T> max(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) { return std::max(a,b); }); }

	// Scalar/vector operations just splat the scalar to a vector...
	___ Vec<N,T> operator+ (T x, Vec<N,T> y) { return Vec<N,T>(x) + y; }
	___ Vec<N,T> operator- (T x, Vec<N,T> y) { return Vec<N,T>(x) - y; }
	___ Vec<N,T> operator* (T x, Vec<N,T> y) { return Vec<N,T>(x) * y; }
	___ Vec<N,T> operator/ (T x, Vec<N,T> y) { return Vec<N,T>(x) / y; }
	___ Vec<N,T> operator^ (T x, Vec<N,T> y) { return Vec<N,T>(x) ^ y; }
	___ Vec<N,T> operator& (T x, Vec<N,T> y) { return Vec<N,T>(x) & y; }
	// Scalar | vector: splat the scalar, then OR lane-wise.
	___ Vec<N,T> operator| (T x, Vec<N,T> y) { return Vec<N,T>(x) | y; }
	___ Vec<N,Mask<T>> operator==(T x, Vec<N,T> y) { return Vec<N,T>(x) == y; }
	___ Vec<N,Mask<T>> operator!=(T x, Vec<N,T> y) { return Vec<N,T>(x) != y; }
	___ Vec<N,Mask<T>> operator<=(T x, Vec<N,T> y) { return Vec<N,T>(x) <= y; }
	___ Vec<N,Mask<T>> operator>=(T x, Vec<N,T> y) { return Vec<N,T>(x) >= y; }
	___ Vec<N,Mask<T>> operator< (T x, Vec<N,T> y) { return Vec<N,T>(x) < y; }
	___ Vec<N,Mask<T>> operator> (T x, Vec<N,T> y) { return Vec<N,T>(x) > y; }
	___ Vec<N,T> min(T x, Vec<N,T> y) { return min(Vec<N,T>(x), y); }
	___ Vec<N,T> max(T x, Vec<N,T> y) { return max(Vec<N,T>(x), y); }
	// ... and same deal for vector/scalar operations.
	___ Vec<N,T> operator+ (Vec<N,T> x, T y) { return x + Vec<N,T>(y); }
	___ Vec<N,T> operator- (Vec<N,T> x, T y) { return x - Vec<N,T>(y); }
	___ Vec<N,T> operator* (Vec<N,T> x, T y) { return x * Vec<N,T>(y); }
	___ Vec<N,T> operator/ (Vec<N,T> x, T y) { return x / Vec<N,T>(y); }
	___ Vec<N,T> operator^ (Vec<N,T> x, T y) { return x ^ Vec<N,T>(y); }
	___ Vec<N,T> operator& (Vec<N,T> x, T y) { return x & Vec<N,T>(y); }
	// Vector | scalar: splat the scalar, then OR lane-wise.
	___ Vec<N,T> operator| (Vec<N,T> x, T y) { return x | Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator==(Vec<N,T> x, T y) { return x == Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator!=(Vec<N,T> x, T y) { return x != Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator<=(Vec<N,T> x, T y) { return x <= Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator>=(Vec<N,T> x, T y) { return x >= Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator< (Vec<N,T> x, T y) { return x < Vec<N,T>(y); }
	___ Vec<N,Mask<T>> operator> (Vec<N,T> x, T y) { return x > Vec<N,T>(y); }
	___ Vec<N,T> min(Vec<N,T> x, T y) { return min(x, Vec<N,T>(y)); }
	___ Vec<N,T> max(Vec<N,T> x, T y) { return max(x, Vec<N,T>(y)); }

	// The various op= operators, for vectors...
	___ Vec<N,T>& operator+=(Vec<N,T>& x, Vec<N,T> y) { return (x = x + y); }
	___ Vec<N,T>& operator-=(Vec<N,T>& x, Vec<N,T> y) { return (x = x - y); }
	// x *= y: multiply lane-wise, store the product in x, return x.
	___ Vec<N,T>& operator*=(Vec<N,T>& x, Vec<N,T> y) { return (x = x * y); }
	___ Vec<N,T>& operator/=(Vec<N,T>& x, Vec<N,T> y) { return (x = x / y); }
	___ Vec<N,T>& operator^=(Vec<N,T>& x, Vec<N,T> y) { return (x = x ^ y); }
	___ Vec<N,T>& operator&=(Vec<N,T>& x, Vec<N,T> y) { return (x = x & y); }
	// x |= y: OR lane-wise, store the result in x, return x.
	___ Vec<N,T>& operator|=(Vec<N,T>& x, Vec<N,T> y) { return (x = x | y); }
	// ... for scalars...
	___ Vec<N,T>& operator+=(Vec<N,T>& x, T y) { return (x = x + Vec<N,T>(y)); }
	___ Vec<N,T>& operator-=(Vec<N,T>& x, T y) { return (x = x - Vec<N,T>(y)); }
	// x *= scalar: splat the scalar, multiply lane-wise, store the product in x, return x.
	___ Vec<N,T>& operator*=(Vec<N,T>& x, T y) { return (x = x * Vec<N,T>(y)); }
	___ Vec<N,T>& operator/=(Vec<N,T>& x, T y) { return (x = x / Vec<N,T>(y)); }
	___ Vec<N,T>& operator^=(Vec<N,T>& x, T y) { return (x = x ^ Vec<N,T>(y)); }
	___ Vec<N,T>& operator&=(Vec<N,T>& x, T y) { return (x = x & Vec<N,T>(y)); }
	// x |= scalar: splat the scalar, OR lane-wise, store the result in x, return x.
	___ Vec<N,T>& operator|=(Vec<N,T>& x, T y) { return (x = x | Vec<N,T>(y)); }
	// ... and for shifts.
	___ Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
	___ Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }

	___ Vec<N,T> ceil(Vec<N,T> x) { return map(x, [](T a) { return std:: ceil(a); }); }
	___ Vec<N,T> floor(Vec<N,T> x) { return map(x, [](T a) { return std::floor(a); }); }
	___ Vec<N,T> trunc(Vec<N,T> x) { return map(x, [](T a) { return std::trunc(a); }); }
	___ Vec<N,T> round(Vec<N,T> x) { return map(x, [](T a) { return std::round(a); }); }
	___ Vec<N,T> sqrt(Vec<N,T> x) { return map(x, [](T a) { return std:: sqrt(a); }); }

	___ Vec<N,T> abs(Vec<N,T> x) { return if_then_else(x < T(0), -x, x); }
	___ Vec<N,T> rcp(Vec<N,T> x) { return T(1) / x; }
	___ Vec<N,T> rsqrt(Vec<N,T> x) { return rcp(sqrt(x)); }


	___ T min(Vec<N,T> x) { return *std::min_element(x.vals, x.vals+N); }
	___ T max(Vec<N,T> x) { return *std::max_element(x.vals, x.vals+N); }

	___ bool any(Vec<N,T> x) { return std::any_of(x.vals, x.vals+N, [](T a) { return a != Mask<T>(0); }); }
	___ bool all(Vec<N,T> x) { return std::all_of(x.vals, x.vals+N, [](T a) { return a != Mask<T>(0); }); }

	// Platform-specific specializations and overloads can now drop in here.

	} // namespace skvx

	// Since cast() takes an extra template argument D (the type to cast to),
	// argument-dependent lookup won't let us just type cast<D>(...), instead
	// skvx::cast<D>(...). That's annoying given how nice all the other methods
	// are, so we'll just move this guy outside into the global namespace.
	// That's pretty harmless... it still only works on skvx::Vec types.
	template <typename D, int N, typename S>
	static inline ALWAYS_INLINE skvx::Vec<N,D> cast(skvx::Vec<N,S> src) {
	#if !defined(SKNX_NO_SIMD) && defined(__clang__)
	return skvx::bit_pun(__builtin_convertvector(skvx::to_vext(src), skvx::VExt<N,D>));
	#else
	return skvx::map(src, [](S a) { return (D)a; });
	#endif
	}


	#undef ALWAYS_INLINE
	#undef ___

	#endif//SKVX_DEFINED