pass SkVx::Vec arguments as const&

Yet another surprising finding from looking at ARM code generation is
that passing these values to functions by const& does make a difference,
even when the calls are fully inlined.  I can only guess that this makes
the compiler more certain that the values won't change.  Anyway, convert
all skvx functions that take Vec arguments to take const Vec& instead.

This tweak is enough to let the natural implementation of mull()
actually produce good code generation, so I've promoted that to SkVx.h
and added a unit test.  Note that in the NEON case we've got a base case
at N=8 and two recursive cases: one recursing down to 8 as usual when
N > 8, and one doubling up to 8 when N < 8.

This is also another big speedup for ARMv7 NEON, bringing it to nearly
the same speed as ARMv8 NEON on the same device.

Bug: chromium:952502
Change-Id: I0f19bab45cf02222ccc8090053ea2a4a380f1dfe
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/208582
Commit-Queue: Michael Ludwig <michaelludwig@google.com>
Auto-Submit: Mike Klein <mtklein@google.com>
Reviewed-by: Michael Ludwig <michaelludwig@google.com>
diff --git a/include/private/SkVx.h b/include/private/SkVx.h
index 66d63d9..a05301f 100644
--- a/include/private/SkVx.h
+++ b/include/private/SkVx.h
@@ -114,7 +114,7 @@
               static inline
 
 template <typename D, typename S>
-static inline D bit_pun(S s) {
+static inline D bit_pun(const S& s) {
     static_assert(sizeof(D) == sizeof(S), "");
     D d;
     memcpy(&d, &s, sizeof(D));
@@ -128,7 +128,7 @@
 template <typename T> using M = typename Mask<T>::type;
 
 // Join two Vec<N,T> into one Vec<2N,T>.
-SINT Vec<2*N,T> join(Vec<N,T> lo, Vec<N,T> hi) {
+SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
     Vec<2*N,T> v;
     v.lo = lo;
     v.hi = hi;
@@ -162,31 +162,31 @@
         static inline Vec<4,float> to_vec(VExt<4,float> v) { return bit_pun<Vec<4,float>>(v); }
     #endif
 
-    SINT VExt<N,T> to_vext(Vec<N,T> v) { return bit_pun<VExt<N,T>>(v); }
-    SINT Vec <N,T> to_vec(VExt<N,T> v) { return bit_pun<Vec <N,T>>(v); }
+    SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return bit_pun<VExt<N,T>>(v); }
+    SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return bit_pun<Vec <N,T>>(v); }
 
-    SINT Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) + to_vext(y)); }
-    SINT Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) - to_vext(y)); }
-    SINT Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) * to_vext(y)); }
-    SINT Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) / to_vext(y)); }
+    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) + to_vext(y)); }
+    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) - to_vext(y)); }
+    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) * to_vext(y)); }
+    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) / to_vext(y)); }
 
-    SINT Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) ^ to_vext(y)); }
-    SINT Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) & to_vext(y)); }
-    SINT Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return to_vec<N,T>(to_vext(x) | to_vext(y)); }
+    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) ^ to_vext(y)); }
+    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) & to_vext(y)); }
+    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) { return to_vec<N,T>(to_vext(x) | to_vext(y)); }
 
-    SINT Vec<N,T> operator!(Vec<N,T> x) { return to_vec<N,T>(!to_vext(x)); }
-    SINT Vec<N,T> operator-(Vec<N,T> x) { return to_vec<N,T>(-to_vext(x)); }
-    SINT Vec<N,T> operator~(Vec<N,T> x) { return to_vec<N,T>(~to_vext(x)); }
+    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
+    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
+    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }
 
-    SINT Vec<N,T> operator<<(Vec<N,T> x, int bits) { return to_vec<N,T>(to_vext(x) << bits); }
-    SINT Vec<N,T> operator>>(Vec<N,T> x, int bits) { return to_vec<N,T>(to_vext(x) >> bits); }
+    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int bits) { return to_vec<N,T>(to_vext(x) << bits); }
+    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int bits) { return to_vec<N,T>(to_vext(x) >> bits); }
 
-    SINT Vec<N,M<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) == to_vext(y)); }
-    SINT Vec<N,M<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) != to_vext(y)); }
-    SINT Vec<N,M<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) <= to_vext(y)); }
-    SINT Vec<N,M<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) >= to_vext(y)); }
-    SINT Vec<N,M<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) <  to_vext(y)); }
-    SINT Vec<N,M<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) >  to_vext(y)); }
+    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) == to_vext(y)); }
+    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) != to_vext(y)); }
+    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) <= to_vext(y)); }
+    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) >= to_vext(y)); }
+    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) <  to_vext(y)); }
+    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) { return bit_pun<Vec<N,M<T>>>(to_vext(x) >  to_vext(y)); }
 
 #else
 
@@ -194,165 +194,165 @@
     // We'll implement things portably, in a way that should be easily autovectorizable.
 
     // N == 1 scalar implementations.
-    SIT Vec<1,T> operator+(Vec<1,T> x, Vec<1,T> y) { return x.val + y.val; }
-    SIT Vec<1,T> operator-(Vec<1,T> x, Vec<1,T> y) { return x.val - y.val; }
-    SIT Vec<1,T> operator*(Vec<1,T> x, Vec<1,T> y) { return x.val * y.val; }
-    SIT Vec<1,T> operator/(Vec<1,T> x, Vec<1,T> y) { return x.val / y.val; }
+    SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
+    SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
+    SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
+    SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }
 
-    SIT Vec<1,T> operator^(Vec<1,T> x, Vec<1,T> y) { return x.val ^ y.val; }
-    SIT Vec<1,T> operator&(Vec<1,T> x, Vec<1,T> y) { return x.val & y.val; }
-    SIT Vec<1,T> operator|(Vec<1,T> x, Vec<1,T> y) { return x.val | y.val; }
+    SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
+    SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
+    SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }
 
-    SIT Vec<1,T> operator!(Vec<1,T> x) { return !x.val; }
-    SIT Vec<1,T> operator-(Vec<1,T> x) { return -x.val; }
-    SIT Vec<1,T> operator~(Vec<1,T> x) { return ~x.val; }
+    SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }
+    SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }
+    SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }
 
-    SIT Vec<1,T> operator<<(Vec<1,T> x, int bits) { return x.val << bits; }
-    SIT Vec<1,T> operator>>(Vec<1,T> x, int bits) { return x.val >> bits; }
+    SIT Vec<1,T> operator<<(const Vec<1,T>& x, int bits) { return x.val << bits; }
+    SIT Vec<1,T> operator>>(const Vec<1,T>& x, int bits) { return x.val >> bits; }
 
-    SIT Vec<1,M<T>> operator==(Vec<1,T> x, Vec<1,T> y) { return x.val == y.val ? ~0 : 0; }
-    SIT Vec<1,M<T>> operator!=(Vec<1,T> x, Vec<1,T> y) { return x.val != y.val ? ~0 : 0; }
-    SIT Vec<1,M<T>> operator<=(Vec<1,T> x, Vec<1,T> y) { return x.val <= y.val ? ~0 : 0; }
-    SIT Vec<1,M<T>> operator>=(Vec<1,T> x, Vec<1,T> y) { return x.val >= y.val ? ~0 : 0; }
-    SIT Vec<1,M<T>> operator< (Vec<1,T> x, Vec<1,T> y) { return x.val <  y.val ? ~0 : 0; }
-    SIT Vec<1,M<T>> operator> (Vec<1,T> x, Vec<1,T> y) { return x.val >  y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val == y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val != y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <= y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >= y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val <  y.val ? ~0 : 0; }
+    SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) { return x.val >  y.val ? ~0 : 0; }
 
     // All default N != 1 implementations just recurse on lo and hi halves.
-    SINT Vec<N,T> operator+(Vec<N,T> x, Vec<N,T> y) { return join(x.lo + y.lo, x.hi + y.hi); }
-    SINT Vec<N,T> operator-(Vec<N,T> x, Vec<N,T> y) { return join(x.lo - y.lo, x.hi - y.hi); }
-    SINT Vec<N,T> operator*(Vec<N,T> x, Vec<N,T> y) { return join(x.lo * y.lo, x.hi * y.hi); }
-    SINT Vec<N,T> operator/(Vec<N,T> x, Vec<N,T> y) { return join(x.lo / y.lo, x.hi / y.hi); }
+    SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo + y.lo, x.hi + y.hi); }
+    SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo - y.lo, x.hi - y.hi); }
+    SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo * y.lo, x.hi * y.hi); }
+    SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo / y.lo, x.hi / y.hi); }
 
-    SINT Vec<N,T> operator^(Vec<N,T> x, Vec<N,T> y) { return join(x.lo ^ y.lo, x.hi ^ y.hi); }
-    SINT Vec<N,T> operator&(Vec<N,T> x, Vec<N,T> y) { return join(x.lo & y.lo, x.hi & y.hi); }
-    SINT Vec<N,T> operator|(Vec<N,T> x, Vec<N,T> y) { return join(x.lo | y.lo, x.hi | y.hi); }
+    SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo ^ y.lo, x.hi ^ y.hi); }
+    SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo & y.lo, x.hi & y.hi); }
+    SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo | y.lo, x.hi | y.hi); }
 
-    SINT Vec<N,T> operator!(Vec<N,T> x) { return join(!x.lo, !x.hi); }
-    SINT Vec<N,T> operator-(Vec<N,T> x) { return join(-x.lo, -x.hi); }
-    SINT Vec<N,T> operator~(Vec<N,T> x) { return join(~x.lo, ~x.hi); }
+    SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }
+    SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }
+    SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }
 
-    SINT Vec<N,T> operator<<(Vec<N,T> x, int bits) { return join(x.lo << bits, x.hi << bits); }
-    SINT Vec<N,T> operator>>(Vec<N,T> x, int bits) { return join(x.lo >> bits, x.hi >> bits); }
+    SINT Vec<N,T> operator<<(const Vec<N,T>& x, int bits) { return join(x.lo << bits, x.hi << bits); }
+    SINT Vec<N,T> operator>>(const Vec<N,T>& x, int bits) { return join(x.lo >> bits, x.hi >> bits); }
 
-    SINT Vec<N,M<T>> operator==(Vec<N,T> x, Vec<N,T> y) { return join(x.lo == y.lo, x.hi == y.hi); }
-    SINT Vec<N,M<T>> operator!=(Vec<N,T> x, Vec<N,T> y) { return join(x.lo != y.lo, x.hi != y.hi); }
-    SINT Vec<N,M<T>> operator<=(Vec<N,T> x, Vec<N,T> y) { return join(x.lo <= y.lo, x.hi <= y.hi); }
-    SINT Vec<N,M<T>> operator>=(Vec<N,T> x, Vec<N,T> y) { return join(x.lo >= y.lo, x.hi >= y.hi); }
-    SINT Vec<N,M<T>> operator< (Vec<N,T> x, Vec<N,T> y) { return join(x.lo <  y.lo, x.hi <  y.hi); }
-    SINT Vec<N,M<T>> operator> (Vec<N,T> x, Vec<N,T> y) { return join(x.lo >  y.lo, x.hi >  y.hi); }
+    SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo == y.lo, x.hi == y.hi); }
+    SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo != y.lo, x.hi != y.hi); }
+    SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <= y.lo, x.hi <= y.hi); }
+    SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >= y.lo, x.hi >= y.hi); }
+    SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo <  y.lo, x.hi <  y.hi); }
+    SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) { return join(x.lo >  y.lo, x.hi >  y.hi); }
 #endif
 
 // Some operations we want are not expressible with Clang/GCC vector
 // extensions, so we implement them using the recursive approach.
 
 // N == 1 scalar implementations.
-SIT Vec<1,T> if_then_else(Vec<1,M<T>> cond, Vec<1,T> t, Vec<1,T> e) {
+SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
     auto t_bits = bit_pun<M<T>>(t),
          e_bits = bit_pun<M<T>>(e);
     return bit_pun<T>( (cond.val & t_bits) | (~cond.val & e_bits) );
 }
 
-SIT bool any(Vec<1,T> x) { return x.val != 0; }
-SIT bool all(Vec<1,T> x) { return x.val != 0; }
+SIT bool any(const Vec<1,T>& x) { return x.val != 0; }
+SIT bool all(const Vec<1,T>& x) { return x.val != 0; }
 
-SIT T min(Vec<1,T> x) { return x.val; }
-SIT T max(Vec<1,T> x) { return x.val; }
+SIT T min(const Vec<1,T>& x) { return x.val; }
+SIT T max(const Vec<1,T>& x) { return x.val; }
 
-SIT Vec<1,T> min(Vec<1,T> x, Vec<1,T> y) { return std::min(x.val, y.val); }
-SIT Vec<1,T> max(Vec<1,T> x, Vec<1,T> y) { return std::max(x.val, y.val); }
+SIT Vec<1,T> min(const Vec<1,T>& x, const Vec<1,T>& y) { return std::min(x.val, y.val); }
+SIT Vec<1,T> max(const Vec<1,T>& x, const Vec<1,T>& y) { return std::max(x.val, y.val); }
 
-SIT Vec<1,T>  ceil(Vec<1,T> x) { return std:: ceil(x.val); }
-SIT Vec<1,T> floor(Vec<1,T> x) { return std::floor(x.val); }
-SIT Vec<1,T> trunc(Vec<1,T> x) { return std::trunc(x.val); }
-SIT Vec<1,T> round(Vec<1,T> x) { return std::round(x.val); }
-SIT Vec<1,T>  sqrt(Vec<1,T> x) { return std:: sqrt(x.val); }
-SIT Vec<1,T>   abs(Vec<1,T> x) { return std::  abs(x.val); }
+SIT Vec<1,T>  ceil(const Vec<1,T>& x) { return std:: ceil(x.val); }
+SIT Vec<1,T> floor(const Vec<1,T>& x) { return std::floor(x.val); }
+SIT Vec<1,T> trunc(const Vec<1,T>& x) { return std::trunc(x.val); }
+SIT Vec<1,T> round(const Vec<1,T>& x) { return std::round(x.val); }
+SIT Vec<1,T>  sqrt(const Vec<1,T>& x) { return std:: sqrt(x.val); }
+SIT Vec<1,T>   abs(const Vec<1,T>& x) { return std::  abs(x.val); }
 
-SIT Vec<1,T>   rcp(Vec<1,T> x) { return 1 / x.val; }
-SIT Vec<1,T> rsqrt(Vec<1,T> x) { return rcp(sqrt(x)); }
-SIT Vec<1,T>   mad(Vec<1,T> f,
-                   Vec<1,T> m,
-                   Vec<1,T> a) { return f*m+a; }
+SIT Vec<1,T>   rcp(const Vec<1,T>& x) { return 1 / x.val; }
+SIT Vec<1,T> rsqrt(const Vec<1,T>& x) { return rcp(sqrt(x)); }
+SIT Vec<1,T>   mad(const Vec<1,T>& f,
+                   const Vec<1,T>& m,
+                   const Vec<1,T>& a) { return f*m+a; }
 
 // All default N != 1 implementations just recurse on lo and hi halves.
-SINT Vec<N,T> if_then_else(Vec<N,M<T>> cond, Vec<N,T> t, Vec<N,T> e) {
+SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
     return join(if_then_else(cond.lo, t.lo, e.lo),
                 if_then_else(cond.hi, t.hi, e.hi));
 }
 
-SINT bool any(Vec<N,T> x) { return any(x.lo) || any(x.hi); }
-SINT bool all(Vec<N,T> x) { return all(x.lo) && all(x.hi); }
+SINT bool any(const Vec<N,T>& x) { return any(x.lo) || any(x.hi); }
+SINT bool all(const Vec<N,T>& x) { return all(x.lo) && all(x.hi); }
 
-SINT T min(Vec<N,T> x) { return std::min(min(x.lo), min(x.hi)); }
-SINT T max(Vec<N,T> x) { return std::max(max(x.lo), max(x.hi)); }
+SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
+SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }
 
-SINT Vec<N,T> min(Vec<N,T> x, Vec<N,T> y) { return join(min(x.lo, y.lo), min(x.hi, y.hi)); }
-SINT Vec<N,T> max(Vec<N,T> x, Vec<N,T> y) { return join(max(x.lo, y.lo), max(x.hi, y.hi)); }
+SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return join(min(x.lo, y.lo), min(x.hi, y.hi)); }
+SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return join(max(x.lo, y.lo), max(x.hi, y.hi)); }
 
-SINT Vec<N,T>  ceil(Vec<N,T> x) { return join( ceil(x.lo),  ceil(x.hi)); }
-SINT Vec<N,T> floor(Vec<N,T> x) { return join(floor(x.lo), floor(x.hi)); }
-SINT Vec<N,T> trunc(Vec<N,T> x) { return join(trunc(x.lo), trunc(x.hi)); }
-SINT Vec<N,T> round(Vec<N,T> x) { return join(round(x.lo), round(x.hi)); }
-SINT Vec<N,T>  sqrt(Vec<N,T> x) { return join( sqrt(x.lo),  sqrt(x.hi)); }
-SINT Vec<N,T>   abs(Vec<N,T> x) { return join(  abs(x.lo),   abs(x.hi)); }
+SINT Vec<N,T>  ceil(const Vec<N,T>& x) { return join( ceil(x.lo),  ceil(x.hi)); }
+SINT Vec<N,T> floor(const Vec<N,T>& x) { return join(floor(x.lo), floor(x.hi)); }
+SINT Vec<N,T> trunc(const Vec<N,T>& x) { return join(trunc(x.lo), trunc(x.hi)); }
+SINT Vec<N,T> round(const Vec<N,T>& x) { return join(round(x.lo), round(x.hi)); }
+SINT Vec<N,T>  sqrt(const Vec<N,T>& x) { return join( sqrt(x.lo),  sqrt(x.hi)); }
+SINT Vec<N,T>   abs(const Vec<N,T>& x) { return join(  abs(x.lo),   abs(x.hi)); }
 
-SINT Vec<N,T>   rcp(Vec<N,T> x) { return join(  rcp(x.lo),   rcp(x.hi)); }
-SINT Vec<N,T> rsqrt(Vec<N,T> x) { return join(rsqrt(x.lo), rsqrt(x.hi)); }
-SINT Vec<N,T>   mad(Vec<N,T> f,
-                    Vec<N,T> m,
-                    Vec<N,T> a) { return join(mad(f.lo, m.lo, a.lo), mad(f.hi, m.hi, a.hi)); }
+SINT Vec<N,T>   rcp(const Vec<N,T>& x) { return join(  rcp(x.lo),   rcp(x.hi)); }
+SINT Vec<N,T> rsqrt(const Vec<N,T>& x) { return join(rsqrt(x.lo), rsqrt(x.hi)); }
+SINT Vec<N,T>   mad(const Vec<N,T>& f,
+                    const Vec<N,T>& m,
+                    const Vec<N,T>& a) { return join(mad(f.lo, m.lo, a.lo), mad(f.hi, m.hi, a.hi)); }
 
 
 // Scalar/vector operations just splat the scalar to a vector...
-SINTU Vec<N,T>    operator+ (U x, Vec<N,T> y) { return Vec<N,T>(x) +  y; }
-SINTU Vec<N,T>    operator- (U x, Vec<N,T> y) { return Vec<N,T>(x) -  y; }
-SINTU Vec<N,T>    operator* (U x, Vec<N,T> y) { return Vec<N,T>(x) *  y; }
-SINTU Vec<N,T>    operator/ (U x, Vec<N,T> y) { return Vec<N,T>(x) /  y; }
-SINTU Vec<N,T>    operator^ (U x, Vec<N,T> y) { return Vec<N,T>(x) ^  y; }
-SINTU Vec<N,T>    operator& (U x, Vec<N,T> y) { return Vec<N,T>(x) &  y; }
-SINTU Vec<N,T>    operator| (U x, Vec<N,T> y) { return Vec<N,T>(x) |  y; }
-SINTU Vec<N,M<T>> operator==(U x, Vec<N,T> y) { return Vec<N,T>(x) == y; }
-SINTU Vec<N,M<T>> operator!=(U x, Vec<N,T> y) { return Vec<N,T>(x) != y; }
-SINTU Vec<N,M<T>> operator<=(U x, Vec<N,T> y) { return Vec<N,T>(x) <= y; }
-SINTU Vec<N,M<T>> operator>=(U x, Vec<N,T> y) { return Vec<N,T>(x) >= y; }
-SINTU Vec<N,M<T>> operator< (U x, Vec<N,T> y) { return Vec<N,T>(x) <  y; }
-SINTU Vec<N,M<T>> operator> (U x, Vec<N,T> y) { return Vec<N,T>(x) >  y; }
-SINTU Vec<N,T>           min(U x, Vec<N,T> y) { return min(Vec<N,T>(x), y); }
-SINTU Vec<N,T>           max(U x, Vec<N,T> y) { return max(Vec<N,T>(x), y); }
+SINTU Vec<N,T>    operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) +  y; }
+SINTU Vec<N,T>    operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) -  y; }
+SINTU Vec<N,T>    operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) *  y; }
+SINTU Vec<N,T>    operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) /  y; }
+SINTU Vec<N,T>    operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^  y; }
+SINTU Vec<N,T>    operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) &  y; }
+SINTU Vec<N,T>    operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) |  y; }
+SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }
+SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }
+SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }
+SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
+SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) <  y; }
+SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) >  y; }
+SINTU Vec<N,T>           min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }
+SINTU Vec<N,T>           max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }
 
 // ... and same deal for vector/scalar operations.
-SINTU Vec<N,T>    operator+ (Vec<N,T> x, U y) { return x +  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator- (Vec<N,T> x, U y) { return x -  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator* (Vec<N,T> x, U y) { return x *  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator/ (Vec<N,T> x, U y) { return x /  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator^ (Vec<N,T> x, U y) { return x ^  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator& (Vec<N,T> x, U y) { return x &  Vec<N,T>(y); }
-SINTU Vec<N,T>    operator| (Vec<N,T> x, U y) { return x |  Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator==(Vec<N,T> x, U y) { return x == Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator!=(Vec<N,T> x, U y) { return x != Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator<=(Vec<N,T> x, U y) { return x <= Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator>=(Vec<N,T> x, U y) { return x >= Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator< (Vec<N,T> x, U y) { return x <  Vec<N,T>(y); }
-SINTU Vec<N,M<T>> operator> (Vec<N,T> x, U y) { return x >  Vec<N,T>(y); }
-SINTU Vec<N,T>           min(Vec<N,T> x, U y) { return min(x, Vec<N,T>(y)); }
-SINTU Vec<N,T>           max(Vec<N,T> x, U y) { return max(x, Vec<N,T>(y)); }
+SINTU Vec<N,T>    operator+ (const Vec<N,T>& x, U y) { return x +  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator- (const Vec<N,T>& x, U y) { return x -  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator* (const Vec<N,T>& x, U y) { return x *  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator/ (const Vec<N,T>& x, U y) { return x /  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator^ (const Vec<N,T>& x, U y) { return x ^  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator& (const Vec<N,T>& x, U y) { return x &  Vec<N,T>(y); }
+SINTU Vec<N,T>    operator| (const Vec<N,T>& x, U y) { return x |  Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x <  Vec<N,T>(y); }
+SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x >  Vec<N,T>(y); }
+SINTU Vec<N,T>           min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
+SINTU Vec<N,T>           max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
 
 // All vector/scalar combinations for mad() with at least one vector.
-SINTU Vec<N,T> mad(U f, Vec<N,T> m, Vec<N,T> a) { return Vec<N,T>(f)*m + a; }
-SINTU Vec<N,T> mad(Vec<N,T> f, U m, Vec<N,T> a) { return f*Vec<N,T>(m) + a; }
-SINTU Vec<N,T> mad(Vec<N,T> f, Vec<N,T> m, U a) { return f*m + Vec<N,T>(a); }
-SINTU Vec<N,T> mad(Vec<N,T> f, U m, U a) { return f*Vec<N,T>(m) + Vec<N,T>(a); }
-SINTU Vec<N,T> mad(U f, Vec<N,T> m, U a) { return Vec<N,T>(f)*m + Vec<N,T>(a); }
-SINTU Vec<N,T> mad(U f, U m, Vec<N,T> a) { return Vec<N,T>(f)*Vec<N,T>(m) + a; }
+SINTU Vec<N,T> mad(U f, const Vec<N,T>& m, const Vec<N,T>& a) { return Vec<N,T>(f)*m + a; }
+SINTU Vec<N,T> mad(const Vec<N,T>& f, U m, const Vec<N,T>& a) { return f*Vec<N,T>(m) + a; }
+SINTU Vec<N,T> mad(const Vec<N,T>& f, const Vec<N,T>& m, U a) { return f*m + Vec<N,T>(a); }
+SINTU Vec<N,T> mad(const Vec<N,T>& f, U m, U a) { return f*Vec<N,T>(m) + Vec<N,T>(a); }
+SINTU Vec<N,T> mad(U f, const Vec<N,T>& m, U a) { return Vec<N,T>(f)*m + Vec<N,T>(a); }
+SINTU Vec<N,T> mad(U f, U m, const Vec<N,T>& a) { return Vec<N,T>(f)*Vec<N,T>(m) + a; }
 
 // The various op= operators, for vectors...
-SINT Vec<N,T>& operator+=(Vec<N,T>& x, Vec<N,T> y) { return (x = x + y); }
-SINT Vec<N,T>& operator-=(Vec<N,T>& x, Vec<N,T> y) { return (x = x - y); }
-SINT Vec<N,T>& operator*=(Vec<N,T>& x, Vec<N,T> y) { return (x = x * y); }
-SINT Vec<N,T>& operator/=(Vec<N,T>& x, Vec<N,T> y) { return (x = x / y); }
-SINT Vec<N,T>& operator^=(Vec<N,T>& x, Vec<N,T> y) { return (x = x ^ y); }
-SINT Vec<N,T>& operator&=(Vec<N,T>& x, Vec<N,T> y) { return (x = x & y); }
-SINT Vec<N,T>& operator|=(Vec<N,T>& x, Vec<N,T> y) { return (x = x | y); }
+SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
+SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }
+SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }
+SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }
+SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }
+SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }
+SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }
 
 // ... for scalars...
 SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }
@@ -369,10 +369,10 @@
 
 // cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.
 template <typename D, typename S>
-static inline Vec<1,D> cast(Vec<1,S> src) { return (D)src.val; }
+static inline Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }
 
 template <typename D, int N, typename S>
-static inline Vec<N,D> cast(Vec<N,S> src) {
+static inline Vec<N,D> cast(const Vec<N,S>& src) {
 #if !defined(SKNX_NO_SIMD) && defined(__clang__)
     return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
 #else
@@ -388,7 +388,7 @@
 //    shuffle<3,3,3,3>        (rgba) ~> {A,A,A,A}
 // The only real restriction is that the output also be a legal N=power-of-two sknx::Vec.
 template <int... Ix, int N, typename T>
-static inline Vec<sizeof...(Ix),T> shuffle(Vec<N,T> x) {
+static inline Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
 #if !defined(SKNX_NO_SIMD) && defined(__clang__)
     return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
 #else
@@ -398,14 +398,14 @@
 
 // div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
 template <int N>
-static inline Vec<N,uint8_t> div255(Vec<N,uint16_t> x) {
+static inline Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
     return cast<uint8_t>( (x+127)/255 );
 }
 
 // approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,
 // and is always perfect when x or y is 0 or 255.
 template <int N>
-static inline Vec<N,uint8_t> approx_scale(Vec<N,uint8_t> x, Vec<N,uint8_t> y) {
+static inline Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
     // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.
     // We happen to have historically picked (x*y+x)/256.
     auto X = cast<uint16_t>(x),
@@ -413,46 +413,87 @@
     return cast<uint8_t>( (X*Y+X)/256 );
 }
 
+#if !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)
+    // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
+    static inline Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x,
+                                       const Vec<8,uint8_t>& y) {
+        return to_vec<8,uint16_t>(vmull_u8(to_vext(x),
+                                           to_vext(y)));
+    }
+
+    template <int N>
+    static inline typename std::enable_if<(N < 8),
+    Vec<N,uint16_t>>::type mull(const Vec<N,uint8_t>& x,
+                                const Vec<N,uint8_t>& y) {
+        // N < 8 --> double up data until N == 8, returning the part we need.
+        return mull(join(x,x),
+                    join(y,y)).lo;
+    }
+
+    template <int N>
+    static inline typename std::enable_if<(N > 8),
+    Vec<N,uint16_t>>::type mull(const Vec<N,uint8_t>& x,
+                                const Vec<N,uint8_t>& y) {
+        // N > 8 --> usual join(lo,hi) strategy to recurse down to N == 8.
+        return join(mull(x.lo, y.lo),
+                    mull(x.hi, y.hi));
+    }
+#else
+    // Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
+    template <int N>
+    static inline Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
+                                       const Vec<N,uint8_t>& y) {
+        return cast<uint16_t>(x)
+             * cast<uint16_t>(y);
+    }
+#endif
+
 #if !defined(SKNX_NO_SIMD)
     // Platform-specific specializations and overloads can now drop in here.
 
     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
-        static inline Vec<4,float> sqrt(Vec<4,float> x) {
+        static inline Vec<4,float> sqrt(const Vec<4,float>& x) {
             return bit_pun<Vec<4,float>>(_mm_sqrt_ps(bit_pun<__m128>(x)));
         }
-        static inline Vec<4,float> rsqrt(Vec<4,float> x) {
+        static inline Vec<4,float> rsqrt(const Vec<4,float>& x) {
             return bit_pun<Vec<4,float>>(_mm_rsqrt_ps(bit_pun<__m128>(x)));
         }
-        static inline Vec<4,float> rcp(Vec<4,float> x) {
+        static inline Vec<4,float> rcp(const Vec<4,float>& x) {
             return bit_pun<Vec<4,float>>(_mm_rcp_ps(bit_pun<__m128>(x)));
         }
 
-        static inline Vec<2,float>  sqrt(Vec<2,float> x) {
+        static inline Vec<2,float>  sqrt(const Vec<2,float>& x) {
             return shuffle<0,1>( sqrt(shuffle<0,1,0,1>(x)));
         }
-        static inline Vec<2,float> rsqrt(Vec<2,float> x) {
+        static inline Vec<2,float> rsqrt(const Vec<2,float>& x) {
             return shuffle<0,1>(rsqrt(shuffle<0,1,0,1>(x)));
         }
-        static inline Vec<2,float>   rcp(Vec<2,float> x) {
+        static inline Vec<2,float>   rcp(const Vec<2,float>& x) {
             return shuffle<0,1>(  rcp(shuffle<0,1,0,1>(x)));
         }
     #endif
 
     #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
-        static inline Vec<4,float> if_then_else(Vec<4,int> c, Vec<4,float> t, Vec<4,float> e) {
+        static inline Vec<4,float> if_then_else(const Vec<4,int  >& c,
+                                                const Vec<4,float>& t,
+                                                const Vec<4,float>& e) {
             return bit_pun<Vec<4,float>>(_mm_blendv_ps(bit_pun<__m128>(e),
                                                        bit_pun<__m128>(t),
                                                        bit_pun<__m128>(c)));
         }
     #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
-        static inline Vec<4,float> if_then_else(Vec<4,int> c, Vec<4,float> t, Vec<4,float> e) {
+        static inline Vec<4,float> if_then_else(const Vec<4,int  >& c,
+                                                const Vec<4,float>& t,
+                                                const Vec<4,float>& e) {
             return bit_pun<Vec<4,float>>(_mm_or_ps(_mm_and_ps   (bit_pun<__m128>(c),
                                                                  bit_pun<__m128>(t)),
                                                    _mm_andnot_ps(bit_pun<__m128>(c),
                                                                  bit_pun<__m128>(e))));
         }
     #elif defined(SK_ARM_HAS_NEON)
-        static inline Vec<4,float> if_then_else(Vec<4,int> c, Vec<4,float> t, Vec<4,float> e) {
+        static inline Vec<4,float> if_then_else(const Vec<4,int  >& c,
+                                                const Vec<4,float>& t,
+                                                const Vec<4,float>& e) {
             return bit_pun<Vec<4,float>>(vbslq_f32(bit_pun<uint32x4_t> (c),
                                                    bit_pun<float32x4_t>(t),
                                                    bit_pun<float32x4_t>(e)));
diff --git a/src/opts/SkBlitRow_opts.h b/src/opts/SkBlitRow_opts.h
index 381c171..70a1e24 100644
--- a/src/opts/SkBlitRow_opts.h
+++ b/src/opts/SkBlitRow_opts.h
@@ -41,32 +41,6 @@
 
 namespace SK_OPTS_NS {
 
-#if defined(SK_ARM_HAS_NEON)
-    // With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
-    // TODO(mtklein): I wish I could make this a bit prettier and still get ideal codegen.
-    static inline skvx::Vec<4,uint16_t> mull(skvx::Vec<4,uint8_t> x, skvx::Vec<4,uint8_t> y) {
-        return skvx::to_vec<8,uint16_t>( vmull_u8(skvx::to_vext(skvx::join(x,x)),
-                                                  skvx::to_vext(skvx::join(y,y))) )
-            .lo;
-    }
-    static inline skvx::Vec<16,uint16_t> mull(skvx::Vec<16,uint8_t> x, skvx::Vec<16,uint8_t> y) {
-        uint16x8_t lo = vmull_u8( skvx::to_vext(x.lo), skvx::to_vext(y.lo) ),
-                   hi = vmull_u8( skvx::to_vext(x.hi), skvx::to_vext(y.hi) );
-        // TODO: why can't I get skvx::join() to generate the same code as this?
-        skvx::Vec<16,uint16_t> r;
-        memcpy(&r.lo, &lo, sizeof(lo));
-        memcpy(&r.hi, &hi, sizeof(hi));
-        return r;
-    }
-#else
-    // Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
-    template <int N>
-    static inline skvx::Vec<N,uint16_t> mull(skvx::Vec<N,uint8_t> x, skvx::Vec<N,uint8_t> y) {
-        return skvx::cast<uint16_t>(x)
-             * skvx::cast<uint16_t>(y);
-    }
-#endif
-
 // Blend constant color over count src pixels, writing into dst.
 inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
     constexpr int N = 4;  // 8, 16 also reasonable choices
diff --git a/tests/SkVxTest.cpp b/tests/SkVxTest.cpp
index cf453bc..270d64c 100644
--- a/tests/SkVxTest.cpp
+++ b/tests/SkVxTest.cpp
@@ -16,9 +16,10 @@
 using double4 = skvx::Vec<4,double>;
 using double8 = skvx::Vec<8,double>;
 
-using byte2 = skvx::Vec<2,uint8_t>;
-using byte4 = skvx::Vec<4,uint8_t>;
-using byte8 = skvx::Vec<8,uint8_t>;
+using byte2  = skvx::Vec< 2,uint8_t>;
+using byte4  = skvx::Vec< 4,uint8_t>;
+using byte8  = skvx::Vec< 8,uint8_t>;
+using byte16 = skvx::Vec<16,uint8_t>;
 
 using int2 = skvx::Vec<2,int32_t>;
 using int4 = skvx::Vec<4,int32_t>;
@@ -166,4 +167,15 @@
             }
         }
     }
+
+    for (int x = 0; x < 256; x++)
+    for (int y = 0; y < 256; y++) {
+        uint16_t xy = x*y;
+
+        // Make sure to cover implementation cases N=8, N<8, and N>8.
+        REPORTER_ASSERT(r, all(mull(byte2 (x), byte2 (y)) == xy));
+        REPORTER_ASSERT(r, all(mull(byte4 (x), byte4 (y)) == xy));
+        REPORTER_ASSERT(r, all(mull(byte8 (x), byte8 (y)) == xy));
+        REPORTER_ASSERT(r, all(mull(byte16(x), byte16(y)) == xy));
+    }
 }