fill in most remaining skvx operations

Obviously lots of these new operations like sqrt() will want platform
specialization.  That'll come later.

Change-Id: Ia0758425d4ec5911968a3d0ad63fa387b9b4cb39
Reviewed-on: https://skia-review.googlesource.com/c/189848
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Herb Derby <herb@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index d233965..3b90523 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -23,6 +23,7 @@
 #include <algorithm>         // std::accumulate, std::copy, std::fill, std::transform, etc.
 #include <cstdint>           // intXX_t
 #include <cstring>           // memcpy()
+#include <cmath>             // std::ceil, std::floor, std::trunc, std::round, std::sqrt, etc.
 #include <functional>        // std::plus, std::minus, std::multiplies, etc.
 #include <initializer_list>  // std::initializer_list
 
@@ -40,6 +41,11 @@
 
     T vals[N];
 
+    // Methods belong here in the class declaration of Vec only if either:
+    //   - they must be here, like constructors or operator[]; or
+    //   - they'll definitely never want a specialized implementation.
+    // Other operations on Vec should be defined outside the type.
+
     Vec() = default;
 
     Vec(T x) { std::fill(vals,vals+N, x); }
@@ -50,6 +56,15 @@
 
     T  operator[](int i) const { return vals[i]; }
     T& operator[](int i)       { return vals[i]; }
+
+    static Vec Load(const void* ptr) {
+        Vec v;
+        memcpy(&v, ptr, sizeof(Vec));
+        return v;
+    }
+    void store(void* ptr) const {
+        memcpy(ptr, this, sizeof(Vec));
+    }
 };
 
 
@@ -64,7 +79,7 @@
 
 #if defined(__GNUC__) && !defined(__clang__) && defined(__SSE__)
     // GCC warns about ABI changes when returning >= 32 byte vectors when -mavx is not enabled.
-    // The functions that do that (BitPun::operator U() and vext()) are marked ALWAYS_INLINE,
+    // The functions that do that (BitPun::operator U() and to_vext()) are marked ALWAYS_INLINE,
     // so we can just stifle the warning.
     #pragma GCC diagnostic ignored "-Wpsabi"
 #endif
@@ -86,10 +101,10 @@
 static inline ALWAYS_INLINE BitPun<V> bit_pun(V v) { return {v}; }
 
 // Translate from a value type T to its corresponding Mask, the result of a comparison.
-template <typename T> struct Mask { using type = T; };
-template <> struct Mask<float > { using type = int32_t; };
-template <> struct Mask<double> { using type = int64_t; };
-template <typename T> using mask = typename Mask<T>::type;
+template <typename T> struct MaskHelper { using type = T; };
+template <> struct MaskHelper<float > { using type = int32_t; };
+template <> struct MaskHelper<double> { using type = int64_t; };
+template <typename T> using Mask = typename MaskHelper<T>::type;
 
 
 // Apply op() to each lane of one or two input vectors, returning a new vector of the results.
@@ -112,7 +127,7 @@
 //    2) fall back to portable implementations when not.
 // At the end we can drop in platform-specific implementations that override these defaults.
 
-#if 1 && !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))
+#if !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))
 
     // VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
     // N.B. VExt<N,T> alignment is N*alignof(T), stricter than Vec<N,T>'s alignof(T).
@@ -131,30 +146,30 @@
         using VExt = typename VExtHelper<N,T>::type;
     #endif
 
-    ___ VExt<N,T> vext(Vec<N,T> v) { return bit_pun(v); }
+    ___ VExt<N,T> to_vext(Vec<N,T> v) { return bit_pun(v); }
 
-    ___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) + vext(y)); }
-    ___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) - vext(y)); }
-    ___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) * vext(y)); }
-    ___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) / vext(y)); }
+    ___ Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) + to_vext(y)); }
+    ___ Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) - to_vext(y)); }
+    ___ Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) * to_vext(y)); }
+    ___ Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) / to_vext(y)); }
 
-    ___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) ^ vext(y)); }
-    ___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) & vext(y)); }
-    ___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) | vext(y)); }
+    ___ Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) ^ to_vext(y)); }
+    ___ Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) & to_vext(y)); }
+    ___ Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) | to_vext(y)); }
 
-    ___ Vec<N,T> operator!(Vec<N,T> x) { return bit_pun(!vext(x)); }
-    ___ Vec<N,T> operator-(Vec<N,T> x) { return bit_pun(-vext(x)); }
-    ___ Vec<N,T> operator~(Vec<N,T> x) { return bit_pun(~vext(x)); }
+    ___ Vec<N,T> operator!(Vec<N,T> x) { return bit_pun(!to_vext(x)); }
+    ___ Vec<N,T> operator-(Vec<N,T> x) { return bit_pun(-to_vext(x)); }
+    ___ Vec<N,T> operator~(Vec<N,T> x) { return bit_pun(~to_vext(x)); }
 
-    ___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return bit_pun(vext(x) << bits); }
-    ___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return bit_pun(vext(x) >> bits); }
+    ___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) << bits); }
+    ___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return bit_pun(to_vext(x) >> bits); }
 
-    ___ Vec<N, mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) == vext(y)); }
-    ___ Vec<N, mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) != vext(y)); }
-    ___ Vec<N, mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) <= vext(y)); }
-    ___ Vec<N, mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) >= vext(y)); }
-    ___ Vec<N, mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) <  vext(y)); }
-    ___ Vec<N, mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return bit_pun(vext(x) >  vext(y)); }
+    ___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) == to_vext(y)); }
+    ___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) != to_vext(y)); }
+    ___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) <= to_vext(y)); }
+    ___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) >= to_vext(y)); }
+    ___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) <  to_vext(y)); }
+    ___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return bit_pun(to_vext(x) >  to_vext(y)); }
 
 #else
 
@@ -177,36 +192,120 @@
     ___ Vec<N,T> operator<<(Vec<N,T> x, int bits) { return map(x, [bits](T a) { return a << bits; }); }
     ___ Vec<N,T> operator>>(Vec<N,T> x, int bits) { return map(x, [bits](T a) { return a >> bits; }); }
 
-    ___ Vec<N, mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a == b ? ~0 : 0; }); }
-    ___ Vec<N, mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a != b ? ~0 : 0; }); }
-    ___ Vec<N, mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a <= b ? ~0 : 0; }); }
-    ___ Vec<N, mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a >= b ? ~0 : 0; }); }
-    ___ Vec<N, mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a <  b ? ~0 : 0; }); }
-    ___ Vec<N, mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> mask<T> { return a >  b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a == b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a != b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a <= b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a >= b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a <  b ? ~0 : 0; }); }
+    ___ Vec<N, Mask<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) -> Mask<T> { return a >  b ? ~0 : 0; }); }
 #endif
 
 // Some operations we want are not expressible with Clang/GCC vector extensions,
 // so we implement them using the same approach as the alternate path above.
 
-___ Vec<N,T> if_then_else(Vec<N,mask<T>> cond, Vec<N,T> t, Vec<N,T> e) {
-    Vec<N,mask<T>> t_bits = bit_pun(t),
+___ Vec<N,T> if_then_else(Vec<N,Mask<T>> cond, Vec<N,T> t, Vec<N,T> e) {
+    Vec<N,Mask<T>> t_bits = bit_pun(t),
                    e_bits = bit_pun(e);
     return bit_pun( (cond & t_bits) | (~cond & e_bits) );
 }
 
+___ const T* begin(const Vec<N,T>& x) { return x.vals  ; }
+___       T* begin(      Vec<N,T>& x) { return x.vals  ; }
+___ const T*   end(const Vec<N,T>& x) { return x.vals+N; }
+___       T*   end(      Vec<N,T>& x) { return x.vals+N; }
+
 ___ Vec<N,T> min(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) { return std::min(a,b); }); }
 ___ Vec<N,T> max(Vec<N,T> x, Vec<N,T> y) { return map(x,y, [](T a, T b) { return std::max(a,b); }); }
 
+// Scalar/vector operations just splat the scalar to a vector...
+___ Vec<N,T>       operator+ (T x, Vec<N,T> y) { return Vec<N,T>(x) +  y; }
+___ Vec<N,T>       operator- (T x, Vec<N,T> y) { return Vec<N,T>(x) -  y; }
+___ Vec<N,T>       operator* (T x, Vec<N,T> y) { return Vec<N,T>(x) *  y; }
+___ Vec<N,T>       operator/ (T x, Vec<N,T> y) { return Vec<N,T>(x) /  y; }
+___ Vec<N,T>       operator^ (T x, Vec<N,T> y) { return Vec<N,T>(x) ^  y; }
+___ Vec<N,T>       operator& (T x, Vec<N,T> y) { return Vec<N,T>(x) &  y; }
+___ Vec<N,T>       operator| (T x, Vec<N,T> y) { return Vec<N,T>(x) |  y; }
+___ Vec<N,Mask<T>> operator==(T x, Vec<N,T> y) { return Vec<N,T>(x) == y; }
+___ Vec<N,Mask<T>> operator!=(T x, Vec<N,T> y) { return Vec<N,T>(x) != y; }
+___ Vec<N,Mask<T>> operator<=(T x, Vec<N,T> y) { return Vec<N,T>(x) <= y; }
+___ Vec<N,Mask<T>> operator>=(T x, Vec<N,T> y) { return Vec<N,T>(x) >= y; }
+___ Vec<N,Mask<T>> operator< (T x, Vec<N,T> y) { return Vec<N,T>(x) <  y; }
+___ Vec<N,Mask<T>> operator> (T x, Vec<N,T> y) { return Vec<N,T>(x) >  y; }
+___ Vec<N,T>              min(T x, Vec<N,T> y) { return min(Vec<N,T>(x), y); }
+___ Vec<N,T>              max(T x, Vec<N,T> y) { return max(Vec<N,T>(x), y); }
+// ... and same deal for vector/scalar operations.
+___ Vec<N,T>       operator+ (Vec<N,T> x, T y) { return x +  Vec<N,T>(y); }
+___ Vec<N,T>       operator- (Vec<N,T> x, T y) { return x -  Vec<N,T>(y); }
+___ Vec<N,T>       operator* (Vec<N,T> x, T y) { return x *  Vec<N,T>(y); }
+___ Vec<N,T>       operator/ (Vec<N,T> x, T y) { return x /  Vec<N,T>(y); }
+___ Vec<N,T>       operator^ (Vec<N,T> x, T y) { return x ^  Vec<N,T>(y); }
+___ Vec<N,T>       operator& (Vec<N,T> x, T y) { return x &  Vec<N,T>(y); }
+___ Vec<N,T>       operator| (Vec<N,T> x, T y) { return x |  Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator==(Vec<N,T> x, T y) { return x == Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator!=(Vec<N,T> x, T y) { return x != Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator<=(Vec<N,T> x, T y) { return x <= Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator>=(Vec<N,T> x, T y) { return x >= Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator< (Vec<N,T> x, T y) { return x <  Vec<N,T>(y); }
+___ Vec<N,Mask<T>> operator> (Vec<N,T> x, T y) { return x >  Vec<N,T>(y); }
+___ Vec<N,T>              min(Vec<N,T> x, T y) { return min(x, Vec<N,T>(y)); }
+___ Vec<N,T>              max(Vec<N,T> x, T y) { return max(x, Vec<N,T>(y)); }
+
+// The various op= operators, for vectors...
+___ Vec<N,T>& operator+=(Vec<N,T>& x, Vec<N,T> y) { return (x = x + y); }
+___ Vec<N,T>& operator-=(Vec<N,T>& x, Vec<N,T> y) { return (x = x - y); }
+___ Vec<N,T>& operator*=(Vec<N,T>& x, Vec<N,T> y) { return (x = x * y); }
+___ Vec<N,T>& operator/=(Vec<N,T>& x, Vec<N,T> y) { return (x = x / y); }
+___ Vec<N,T>& operator^=(Vec<N,T>& x, Vec<N,T> y) { return (x = x ^ y); }
+___ Vec<N,T>& operator&=(Vec<N,T>& x, Vec<N,T> y) { return (x = x & y); }
+___ Vec<N,T>& operator|=(Vec<N,T>& x, Vec<N,T> y) { return (x = x | y); }
+// ... for scalars...
+___ Vec<N,T>& operator+=(Vec<N,T>& x, T y) { return (x = x + Vec<N,T>(y)); }
+___ Vec<N,T>& operator-=(Vec<N,T>& x, T y) { return (x = x - Vec<N,T>(y)); }
+___ Vec<N,T>& operator*=(Vec<N,T>& x, T y) { return (x = x * Vec<N,T>(y)); }
+___ Vec<N,T>& operator/=(Vec<N,T>& x, T y) { return (x = x / Vec<N,T>(y)); }
+___ Vec<N,T>& operator^=(Vec<N,T>& x, T y) { return (x = x ^ Vec<N,T>(y)); }
+___ Vec<N,T>& operator&=(Vec<N,T>& x, T y) { return (x = x & Vec<N,T>(y)); }
+___ Vec<N,T>& operator|=(Vec<N,T>& x, T y) { return (x = x | Vec<N,T>(y)); }
+// ... and for shifts.
+___ Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
+___ Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }
+
+___ Vec<N,T>  ceil(Vec<N,T> x) { return map(x, [](T a) { return std:: ceil(a); }); }
+___ Vec<N,T> floor(Vec<N,T> x) { return map(x, [](T a) { return std::floor(a); }); }
+___ Vec<N,T> trunc(Vec<N,T> x) { return map(x, [](T a) { return std::trunc(a); }); }
+___ Vec<N,T> round(Vec<N,T> x) { return map(x, [](T a) { return std::round(a); }); }
+___ Vec<N,T>  sqrt(Vec<N,T> x) { return map(x, [](T a) { return std:: sqrt(a); }); }
+
+___ Vec<N,T>   abs(Vec<N,T> x) { return if_then_else(x < T(0), -x, x); }
+___ Vec<N,T>   rcp(Vec<N,T> x) { return T(1) / x; }
+___ Vec<N,T> rsqrt(Vec<N,T> x) { return rcp(sqrt(x)); }
+
+
 ___ T min(Vec<N,T> x) { return *std::min_element(x.vals, x.vals+N); }
 ___ T max(Vec<N,T> x) { return *std::max_element(x.vals, x.vals+N); }
 
-___ bool any(Vec<N,T> x) { return std::any_of(x.vals, x.vals+N, [](T a) { return a != mask<T>(0); }); }
-___ bool all(Vec<N,T> x) { return std::all_of(x.vals, x.vals+N, [](T a) { return a != mask<T>(0); }); }
+___ bool any(Vec<N,T> x) { return std::any_of(x.vals, x.vals+N, [](T a) { return a != Mask<T>(0); }); }
+___ bool all(Vec<N,T> x) { return std::all_of(x.vals, x.vals+N, [](T a) { return a != Mask<T>(0); }); }
 
 // Platform-specific specializations and overloads can now drop in here.
 
 }  // namespace skvx
 
+// cast() takes an explicit template argument D (the type to cast to), so an
+// unqualified call cast<D>(...) can't be found by argument-dependent lookup;
+// callers would have to write skvx::cast<D>(...).  That's annoying given how
+// nice all the other operations are, so we define cast() in the global namespace.
+// That's pretty harmless... it still only works on skvx::Vec types.
+template <typename D, int N, typename S>
+static inline ALWAYS_INLINE skvx::Vec<N,D> cast(skvx::Vec<N,S> src) {
+#if !defined(SKNX_NO_SIMD) && defined(__clang__)
+    return skvx::bit_pun(__builtin_convertvector(skvx::to_vext(src), skvx::VExt<N,D>));
+#else
+    return skvx::map(src, [](S a) { return (D)a; });
+#endif
+}
+
+
 #undef ALWAYS_INLINE
 #undef ___
 
diff --git a/tests/SkVxTest.cpp b/tests/SkVxTest.cpp
index 71290ad..1603843c 100644
--- a/tests/SkVxTest.cpp
+++ b/tests/SkVxTest.cpp
@@ -76,4 +76,37 @@
 
     REPORTER_ASSERT(r, all(if_then_else(float4{1,2,3,2} <= float4{2,2,2,2}, float4(42), float4(47))
                            == float4{42,42,47,42}));
+
+    REPORTER_ASSERT(r, all(floor(float4{-1.5f,1.5f,1.0f,-1.0f}) == float4{-2.0f,1.0f,1.0f,-1.0f}));
+    REPORTER_ASSERT(r, all( ceil(float4{-1.5f,1.5f,1.0f,-1.0f}) == float4{-1.0f,2.0f,1.0f,-1.0f}));
+    REPORTER_ASSERT(r, all(trunc(float4{-1.5f,1.5f,1.0f,-1.0f}) == float4{-1.0f,1.0f,1.0f,-1.0f}));
+    REPORTER_ASSERT(r, all(round(float4{-1.5f,1.5f,1.0f,-1.0f}) == float4{-2.0f,2.0f,1.0f,-1.0f}));
+
+
+    REPORTER_ASSERT(r, all(abs(float4{-2,-1,0,1}) == float4{2,1,0,1}));
+
+    // TODO(mtklein): these tests could be made less loose.
+    REPORTER_ASSERT(r, all( sqrt(float4{2,3,4,5}) < float4{2,2,3,3}));
+    REPORTER_ASSERT(r, all(  rcp(float4{2,3,4,5}) < float4{1.0f,0.5f,0.5f,0.3f}));
+    REPORTER_ASSERT(r, all(rsqrt(float4{2,3,4,5}) < float4{1.0f,1.0f,1.0f,0.5f}));
+
+    REPORTER_ASSERT(r, all(cast<int>(float4{-1.5f,0.5f,1.0f,1.5f}) == int4{-1,0,1,1}));
+
+    float buf[4] = {1,2,3,4};
+    REPORTER_ASSERT(r, all(float4::Load(buf) == float4{1,2,3,4}));
+    float4{2,3,4,5}.store(buf);
+    REPORTER_ASSERT(r, all(float4::Load(buf) == float4{2,3,4,5}));
+
+    {
+
+        int4 iota = {0,1,2,3};
+        int i = 0;
+        for (int val : iota) {
+            REPORTER_ASSERT(r, val == i++);
+        }
+        for (int& val : iota) {
+            val = 42;
+        }
+        REPORTER_ASSERT(r, all(iota == 42));
+    }
 }