Implement re-ordering for PLS atomic draws

Atomic mode needs a barrier between overlapping draws. Implement re-ordering of non-overlapping draws to minimize barriers.

Add a massively SIMD "IntersectionBoard" class that assigns a minimal "groupIndex" to each draw based on its bounding box. Sort draws based on groupIndex, plus lower priority items like draw type and texture hash. Issue draws in the new order with barriers between different groupIndex values.

Discard draws ahead of time that are empty or offscreen. (IntersectionBoard doesn't support these.)

Now that the draw logic is getting more complex, store PLSDraws in a smart pointer that guarantees we never miss a call to releaseRefs().

New simd features:
  - reduce_add, reduce_min, reduce_max, reduce_and, reduce_or
  - >2 arguments to join
  - zip()
  - more typedefs

Diffs=
d67aeac4d Implement re-ordering for PLS atomic draws (#6417)

Co-authored-by: Chris Dalton <99840794+csmartdalton@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head
index eead455..2e9e314 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-a3788ed8ad14d8046faec27d34c9483ddb7d7cb3
+d67aeac4d7643cdb60f7d22c9b2c0005e7838ed9
diff --git a/include/rive/math/aabb.hpp b/include/rive/math/aabb.hpp
index 1749209..d6efcd6 100644
--- a/include/rive/math/aabb.hpp
+++ b/include/rive/math/aabb.hpp
@@ -2,7 +2,6 @@
 #define _RIVE_AABB_HPP_
 
 #include "rive/span.hpp"
-#include "rive/math/mat2d.hpp"
 #include "rive/math/vec2d.hpp"
 #include <cstddef>
 #include <limits>
@@ -67,6 +66,7 @@
     AABB offset(float dx, float dy) const { return {minX + dx, minY + dy, maxX + dx, maxY + dy}; }
 
     IAABB round() const;
+    IAABB roundOut() const; // Rounds out to integer bounds that fully contain the rectangle.
 
     ///
     /// Initialize an AABB to values that represent an invalid/collapsed
diff --git a/include/rive/math/mat2d.hpp b/include/rive/math/mat2d.hpp
index 6c347c6..8576cd1 100644
--- a/include/rive/math/mat2d.hpp
+++ b/include/rive/math/mat2d.hpp
@@ -1,6 +1,7 @@
 #ifndef _RIVE_MAT2D_HPP_
 #define _RIVE_MAT2D_HPP_
 
+#include "rive/math/aabb.hpp"
 #include "rive/math/vec2d.hpp"
 #include <array>
 #include <cstddef>
@@ -44,6 +45,11 @@
     // Sets dst[i] = M * pts[i] for i in 0..n-1.
     void mapPoints(Vec2D dst[], const Vec2D pts[], size_t n) const;
 
+    // Computes a bounding box that would tightly contain the given points if they were to all be
+    // transformed by this matrix.
+    AABB mapBoundingBox(const Vec2D pts[], size_t n) const;
+    AABB mapBoundingBox(const AABB&) const;
+
     // If returns true, result holds the inverse.
     // If returns false, result is unchnaged.
     bool invert(Mat2D* result) const;
diff --git a/include/rive/math/math_types.hpp b/include/rive/math/math_types.hpp
index 3dd14bf..f9b28d7 100644
--- a/include/rive/math/math_types.hpp
+++ b/include/rive/math/math_types.hpp
@@ -16,6 +16,7 @@
 namespace math
 {
 constexpr float PI = 3.14159265f;
+constexpr float SQRT2 = 1.41421356f;
 constexpr float EPSILON = 1.f / (1 << 12); // Common threshold for detecting values near zero.
 
 RIVE_MAYBE_UNUSED inline bool nearly_zero(float a, float tolerance = EPSILON)
diff --git a/include/rive/math/simd.hpp b/include/rive/math/simd.hpp
index 097a06c..b2d6e21 100644
--- a/include/rive/math/simd.hpp
+++ b/include/rive/math/simd.hpp
@@ -16,7 +16,7 @@
 
 // #define RIVE_SIMD_PERF_WARNINGS
 
-#include "rive/rive_types.hpp"
+#include <algorithm>
 #include <cassert>
 #include <limits>
 #include <math.h>
@@ -32,6 +32,22 @@
 #include <arm_neon.h>
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+#define SIMD_ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#define __has_builtin(x) 0
+#define SIMD_ALWAYS_INLINE inline
+#endif
+
+#if __has_builtin(__builtin_memcpy)
+#define SIMD_INLINE_MEMCPY __builtin_memcpy
+#else
+#define SIMD_INLINE_MEMCPY memcpy
+#endif
+
 // SIMD math can expect conformant IEEE 754 behavior for NaN and Inf.
 static_assert(std::numeric_limits<float>::is_iec559,
               "Conformant IEEE 754 behavior for NaN and Inf is required.");
@@ -82,7 +98,7 @@
 //
 
 // Returns true if all elements in x are equal to 0.
-template <typename T, int N> RIVE_ALWAYS_INLINE bool any(gvec<T, N> x)
+template <typename T, int N> SIMD_ALWAYS_INLINE bool any(gvec<T, N> x)
 {
 #if __has_builtin(__builtin_reduce_or)
     return __builtin_reduce_or(x);
@@ -101,7 +117,7 @@
 }
 
 // Returns true if all elements in x are equal to ~0.
-template <typename T, int N> RIVE_ALWAYS_INLINE bool all(gvec<T, N> x)
+template <typename T, int N> SIMD_ALWAYS_INLINE bool all(gvec<T, N> x)
 {
 #if __has_builtin(__builtin_reduce_and)
     return __builtin_reduce_and(x);
@@ -117,7 +133,7 @@
 template <typename T,
           int N,
           typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
-RIVE_ALWAYS_INLINE gvec<typename boolean_mask_type<T>::type, N> isnan(gvec<T, N> x)
+SIMD_ALWAYS_INLINE gvec<typename boolean_mask_type<T>::type, N> isnan(gvec<T, N> x)
 {
     return ~(x == x);
 }
@@ -132,7 +148,7 @@
 
 // Elementwise ternary expression: "_if ? _then : _else" for each component.
 template <typename T, int N>
-RIVE_ALWAYS_INLINE gvec<T, N> if_then_else(gvec<typename boolean_mask_type<T>::type, N> _if,
+SIMD_ALWAYS_INLINE gvec<T, N> if_then_else(gvec<typename boolean_mask_type<T>::type, N> _if,
                                            gvec<T, N> _then,
                                            gvec<T, N> _else)
 {
@@ -152,7 +168,7 @@
 
 // Similar to std::min(), with a noteworthy difference:
 // If a[i] or b[i] is NaN and the other is not, returns whichever is _not_ NaN.
-template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> min(gvec<T, N> a, gvec<T, N> b)
+template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> min(gvec<T, N> a, gvec<T, N> b)
 {
 #if __has_builtin(__builtin_elementwise_min)
     return __builtin_elementwise_min(a, b);
@@ -167,7 +183,7 @@
 
 // Similar to std::max(), with a noteworthy difference:
 // If a[i] or b[i] is NaN and the other is not, returns whichever is _not_ NaN.
-template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> max(gvec<T, N> a, gvec<T, N> b)
+template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> max(gvec<T, N> a, gvec<T, N> b)
 {
 #if __has_builtin(__builtin_elementwise_max)
     return __builtin_elementwise_max(a, b);
@@ -187,14 +203,14 @@
 //   Ignores hi and/or lo if they are NaN.
 //
 template <typename T, int N>
-RIVE_ALWAYS_INLINE gvec<T, N> clamp(gvec<T, N> x, gvec<T, N> lo, gvec<T, N> hi)
+SIMD_ALWAYS_INLINE gvec<T, N> clamp(gvec<T, N> x, gvec<T, N> lo, gvec<T, N> hi)
 {
     return min(max(lo, x), hi);
 }
 
 // Returns the absolute value of x per element, with one exception:
 // If x[i] is an integer type and equal to the minimum representable value, returns x[i].
-template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> abs(gvec<T, N> x)
+template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> abs(gvec<T, N> x)
 {
 #if __has_builtin(__builtin_elementwise_abs)
     return __builtin_elementwise_abs(x);
@@ -206,30 +222,99 @@
 #endif
 }
 
-template <typename T, int N> RIVE_ALWAYS_INLINE T sum(gvec<T, N> x)
+template <typename T, int N>
+SIMD_ALWAYS_INLINE typename std::enable_if<std::is_integral<T>::value, T>::type reduce_add(
+    gvec<T, N> x)
 {
+#if __has_builtin(__builtin_reduce_add)
+    return __builtin_reduce_add(x);
+#else
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
+#endif
+    T s = x[0];
+    for (int i = 1; i < N; ++i)
+        s += x[i];
+    return s;
+#endif
+}
+
+template <typename T, int N>
+SIMD_ALWAYS_INLINE typename std::enable_if<!std::is_integral<T>::value, T>::type reduce_add(
+    gvec<T, N> x)
+{
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
+#endif
     T s = x[0];
     for (int i = 1; i < N; ++i)
         s += x[i];
     return s;
 }
 
-// We can use __builtin_reduce_add for integer types.
-#if __has_builtin(__builtin_reduce_add)
-template <int N> RIVE_ALWAYS_INLINE int32_t sum(gvec<int32_t, N> x)
+template <typename T, int N> SIMD_ALWAYS_INLINE T reduce_min(gvec<T, N> x)
 {
-    return __builtin_reduce_add(x);
+#if __has_builtin(__builtin_reduce_and)
+    return __builtin_reduce_min(x);
+#else
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
+#endif
+    T reduced = x[0];
+    for (int i = 1; i < N; ++i)
+        reduced = std::min(reduced, x[i]);
+    return reduced;
+#endif
 }
 
-template <int N> RIVE_ALWAYS_INLINE uint32_t sum(gvec<uint32_t, N> x)
+template <typename T, int N> SIMD_ALWAYS_INLINE T reduce_max(gvec<T, N> x)
 {
-    return __builtin_reduce_add(x);
-}
+#if __has_builtin(__builtin_reduce_and)
+    return __builtin_reduce_max(x);
+#else
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
 #endif
+    T reduced = x[0];
+    for (int i = 1; i < N; ++i)
+        reduced = std::max(reduced, x[i]);
+    return reduced;
+#endif
+}
+
+template <typename T, int N> SIMD_ALWAYS_INLINE T reduce_and(gvec<T, N> x)
+{
+#if __has_builtin(__builtin_reduce_and)
+    return __builtin_reduce_and(x);
+#else
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
+#endif
+    T reduced = x[0];
+    for (int i = 1; i < N; ++i)
+        reduced &= x[i];
+    return reduced;
+#endif
+}
+
+template <typename T, int N> SIMD_ALWAYS_INLINE T reduce_or(gvec<T, N> x)
+{
+#if __has_builtin(__builtin_reduce_and)
+    return __builtin_reduce_or(x);
+#else
+#ifdef RIVE_SIMD_PERF_WARNINGS
+#pragma message("performance: __builtin_reduce_and() not supported. Consider updating clang.")
+#endif
+    T reduced = x[0];
+    for (int i = 1; i < N; ++i)
+        reduced |= x[i];
+    return reduced;
+#endif
+}
 
 ////// Floating Point Functions //////
 
-template <int N> RIVE_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> floor(gvec<float, N> x)
 {
 #if __has_builtin(__builtin_elementwise_floor)
     return __builtin_elementwise_floor(x);
@@ -244,7 +329,7 @@
 #endif
 }
 
-template <int N> RIVE_ALWAYS_INLINE gvec<float, N> ceil(gvec<float, N> x)
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> ceil(gvec<float, N> x)
 {
 #if __has_builtin(__builtin_elementwise_ceil)
     return __builtin_elementwise_ceil(x);
@@ -259,7 +344,7 @@
 }
 
 // IEEE compliant sqrt.
-template <int N> RIVE_ALWAYS_INLINE gvec<float, N> sqrt(gvec<float, N> x)
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> sqrt(gvec<float, N> x)
 {
     // There isn't an elementwise builtin for sqrt. We define architecture-specific specializations
     // of this function later.
@@ -269,53 +354,53 @@
 }
 
 #ifdef __SSE__
-template <> RIVE_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
 {
     __m128 _x;
-    RIVE_INLINE_MEMCPY(&_x, &x, sizeof(float) * 4);
+    SIMD_INLINE_MEMCPY(&_x, &x, sizeof(float) * 4);
     _x = _mm_sqrt_ps(_x);
-    RIVE_INLINE_MEMCPY(&x, &_x, sizeof(float) * 4);
+    SIMD_INLINE_MEMCPY(&x, &_x, sizeof(float) * 4);
     return x;
 }
 
-template <> RIVE_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
 {
     __m128 _x;
-    RIVE_INLINE_MEMCPY(&_x, &x, sizeof(float) * 2);
+    SIMD_INLINE_MEMCPY(&_x, &x, sizeof(float) * 2);
     _x = _mm_sqrt_ps(_x);
-    RIVE_INLINE_MEMCPY(&x, &_x, sizeof(float) * 2);
+    SIMD_INLINE_MEMCPY(&x, &_x, sizeof(float) * 2);
     return x;
 }
 #endif
 
 #ifdef __aarch64__
-template <> RIVE_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
 {
     float32x4_t _x;
-    RIVE_INLINE_MEMCPY(&_x, &x, sizeof(float) * 4);
+    SIMD_INLINE_MEMCPY(&_x, &x, sizeof(float) * 4);
     _x = vsqrtq_f32(_x);
-    RIVE_INLINE_MEMCPY(&x, &_x, sizeof(float) * 4);
+    SIMD_INLINE_MEMCPY(&x, &_x, sizeof(float) * 4);
     return x;
 }
 
-template <> RIVE_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
 {
     float32x2_t _x;
-    RIVE_INLINE_MEMCPY(&_x, &x, sizeof(float) * 2);
+    SIMD_INLINE_MEMCPY(&_x, &x, sizeof(float) * 2);
     _x = vsqrt_f32(_x);
-    RIVE_INLINE_MEMCPY(&x, &_x, sizeof(float) * 2);
+    SIMD_INLINE_MEMCPY(&x, &_x, sizeof(float) * 2);
     return x;
 }
 #endif
 
 // This will only be present when building with Emscripten and "-msimd128".
 #if __has_builtin(__builtin_wasm_sqrt_f32x4)
-template <> RIVE_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 4> sqrt(gvec<float, 4> x)
 {
     return __builtin_wasm_sqrt_f32x4(x);
 }
 
-template <> RIVE_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
+template <> SIMD_ALWAYS_INLINE gvec<float, 2> sqrt(gvec<float, 2> x)
 {
     gvec<float, 4> _x{x.x, x.y};
     _x = __builtin_wasm_sqrt_f32x4(_x);
@@ -329,7 +414,7 @@
 //
 // See: https://stackoverflow.com/a/36387954
 #define SIMD_FAST_ACOS_MAX_ERROR 0.0167552f // .96 degrees
-template <int N> RIVE_ALWAYS_INLINE gvec<float, N> fast_acos(gvec<float, N> x)
+template <int N> SIMD_ALWAYS_INLINE gvec<float, N> fast_acos(gvec<float, N> x)
 {
     constexpr static float a = -0.939115566365855f;
     constexpr static float b = 0.9217841528914573f;
@@ -344,7 +429,7 @@
 
 ////// Type conversion //////
 
-template <typename U, typename T, int N> RIVE_ALWAYS_INLINE gvec<U, N> cast(gvec<T, N> x)
+template <typename U, typename T, int N> SIMD_ALWAYS_INLINE gvec<U, N> cast(gvec<T, N> x)
 {
     gvec<U, N> y{};
     for (int i = 0; i < N; ++i)
@@ -354,57 +439,57 @@
 
 ////// Loading and storing //////
 
-template <typename T, int N> RIVE_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
+template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N> load(const void* ptr)
 {
     gvec<T, N> ret;
-    RIVE_INLINE_MEMCPY(&ret, ptr, sizeof(T) * N);
+    SIMD_INLINE_MEMCPY(&ret, ptr, sizeof(T) * N);
     return ret;
 }
-RIVE_ALWAYS_INLINE gvec<float, 2> load2f(const void* ptr) { return load<float, 2>(ptr); }
-RIVE_ALWAYS_INLINE gvec<float, 4> load4f(const void* ptr) { return load<float, 4>(ptr); }
-RIVE_ALWAYS_INLINE gvec<int32_t, 2> load2i(const void* ptr) { return load<int32_t, 2>(ptr); }
-RIVE_ALWAYS_INLINE gvec<int32_t, 4> load4i(const void* ptr) { return load<int32_t, 4>(ptr); }
-RIVE_ALWAYS_INLINE gvec<uint32_t, 2> load2ui(const void* ptr) { return load<uint32_t, 2>(ptr); }
-RIVE_ALWAYS_INLINE gvec<uint32_t, 4> load4ui(const void* ptr) { return load<uint32_t, 4>(ptr); }
+SIMD_ALWAYS_INLINE gvec<float, 2> load2f(const void* ptr) { return load<float, 2>(ptr); }
+SIMD_ALWAYS_INLINE gvec<float, 4> load4f(const void* ptr) { return load<float, 4>(ptr); }
+SIMD_ALWAYS_INLINE gvec<int32_t, 2> load2i(const void* ptr) { return load<int32_t, 2>(ptr); }
+SIMD_ALWAYS_INLINE gvec<int32_t, 4> load4i(const void* ptr) { return load<int32_t, 4>(ptr); }
+SIMD_ALWAYS_INLINE gvec<uint32_t, 2> load2ui(const void* ptr) { return load<uint32_t, 2>(ptr); }
+SIMD_ALWAYS_INLINE gvec<uint32_t, 4> load4ui(const void* ptr) { return load<uint32_t, 4>(ptr); }
 
-template <typename T, int N> RIVE_ALWAYS_INLINE void store(void* dst, gvec<T, N> vec)
+template <typename T, int N> SIMD_ALWAYS_INLINE void store(void* dst, gvec<T, N> vec)
 {
-    RIVE_INLINE_MEMCPY(dst, &vec, sizeof(T) * N);
+    SIMD_INLINE_MEMCPY(dst, &vec, sizeof(T) * N);
 }
 
 ////// Column-major (transposed) loads //////
 
 #if defined(__ARM_NEON__) || defined(__aarch64__)
-RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+SIMD_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
 load4x4f(const float* matrix)
 {
     float32x4x4_t m = vld4q_f32(matrix);
     gvec<float, 4> c0, c1, c2, c3;
-    RIVE_INLINE_MEMCPY(&c0, &m.val[0], sizeof(c0));
-    RIVE_INLINE_MEMCPY(&c1, &m.val[1], sizeof(c1));
-    RIVE_INLINE_MEMCPY(&c2, &m.val[2], sizeof(c2));
-    RIVE_INLINE_MEMCPY(&c3, &m.val[3], sizeof(c3));
+    SIMD_INLINE_MEMCPY(&c0, &m.val[0], sizeof(c0));
+    SIMD_INLINE_MEMCPY(&c1, &m.val[1], sizeof(c1));
+    SIMD_INLINE_MEMCPY(&c2, &m.val[2], sizeof(c2));
+    SIMD_INLINE_MEMCPY(&c3, &m.val[3], sizeof(c3));
     return {c0, c1, c2, c3};
 }
 #elif defined(__SSE__)
-RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+SIMD_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
 load4x4f(const float* m)
 {
     __m128 r0, r1, r2, r3;
-    RIVE_INLINE_MEMCPY(&r0, m + 4 * 0, sizeof(r0));
-    RIVE_INLINE_MEMCPY(&r1, m + 4 * 1, sizeof(r1));
-    RIVE_INLINE_MEMCPY(&r2, m + 4 * 2, sizeof(r2));
-    RIVE_INLINE_MEMCPY(&r3, m + 4 * 3, sizeof(r3));
+    SIMD_INLINE_MEMCPY(&r0, m + 4 * 0, sizeof(r0));
+    SIMD_INLINE_MEMCPY(&r1, m + 4 * 1, sizeof(r1));
+    SIMD_INLINE_MEMCPY(&r2, m + 4 * 2, sizeof(r2));
+    SIMD_INLINE_MEMCPY(&r3, m + 4 * 3, sizeof(r3));
     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
     gvec<float, 4> c0, c1, c2, c3;
-    RIVE_INLINE_MEMCPY(&c0, &r0, sizeof(c0));
-    RIVE_INLINE_MEMCPY(&c1, &r1, sizeof(c1));
-    RIVE_INLINE_MEMCPY(&c2, &r2, sizeof(c2));
-    RIVE_INLINE_MEMCPY(&c3, &r3, sizeof(c3));
+    SIMD_INLINE_MEMCPY(&c0, &r0, sizeof(c0));
+    SIMD_INLINE_MEMCPY(&c1, &r1, sizeof(c1));
+    SIMD_INLINE_MEMCPY(&c2, &r2, sizeof(c2));
+    SIMD_INLINE_MEMCPY(&c3, &r3, sizeof(c3));
     return {c0, c1, c2, c3};
 }
 #else
-RIVE_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
+SIMD_ALWAYS_INLINE std::tuple<gvec<float, 4>, gvec<float, 4>, gvec<float, 4>, gvec<float, 4>>
 load4x4f(const float* m)
 {
     gvec<float, 4> c0 = {m[0], m[4], m[8], m[12]};
@@ -416,22 +501,57 @@
 #endif
 
 template <typename T, int M, int N>
-RIVE_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b)
+SIMD_ALWAYS_INLINE gvec<T, M + N> join(gvec<T, M> a, gvec<T, N> b)
 {
     T data[M + N];
-    RIVE_INLINE_MEMCPY(data, &a, sizeof(T) * M);
-    RIVE_INLINE_MEMCPY(data + M, &b, sizeof(T) * N);
+    SIMD_INLINE_MEMCPY(data, &a, sizeof(T) * M);
+    SIMD_INLINE_MEMCPY(data + M, &b, sizeof(T) * N);
     return load<T, M + N>(data);
 }
 
+template <typename T, int M, int N, int O>
+SIMD_ALWAYS_INLINE gvec<T, M + N + O> join(gvec<T, M> a, gvec<T, N> b, gvec<T, O> c)
+{
+    T data[M + N + O];
+    SIMD_INLINE_MEMCPY(data, &a, sizeof(T) * M);
+    SIMD_INLINE_MEMCPY(data + M, &b, sizeof(T) * N);
+    SIMD_INLINE_MEMCPY(data + M + N, &c, sizeof(T) * O);
+    return load<T, M + N + O>(data);
+}
+
+template <typename T, int M, int N, int O, int P>
+SIMD_ALWAYS_INLINE gvec<T, M + N + O + P> join(gvec<T, M> a,
+                                               gvec<T, N> b,
+                                               gvec<T, O> c,
+                                               gvec<T, P> d)
+{
+    T data[M + N + O + P];
+    SIMD_INLINE_MEMCPY(data, &a, sizeof(T) * M);
+    SIMD_INLINE_MEMCPY(data + M, &b, sizeof(T) * N);
+    SIMD_INLINE_MEMCPY(data + M + N, &c, sizeof(T) * O);
+    SIMD_INLINE_MEMCPY(data + M + N + O, &d, sizeof(T) * P);
+    return load<T, M + N + O + P>(data);
+}
+
+template <typename T, int N> SIMD_ALWAYS_INLINE gvec<T, N * 2> zip(gvec<T, N> a, gvec<T, N> b)
+{
+    gvec<T, N * 2> ret{};
+    for (int i = 0; i < N; ++i)
+    {
+        ret[i * 2] = a[i];
+        ret[i * 2 + 1] = b[i];
+    }
+    return ret;
+}
+
 ////// Basic linear algebra //////
 
-template <typename T, int N> RIVE_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
+template <typename T, int N> SIMD_ALWAYS_INLINE T dot(gvec<T, N> a, gvec<T, N> b)
 {
-    return sum(a * b);
+    return reduce_add(a * b);
 }
 
-RIVE_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
+SIMD_ALWAYS_INLINE float cross(gvec<float, 2> a, gvec<float, 2> b)
 {
     auto c = a * b.yx;
     return c.x - c.y;
@@ -445,7 +565,7 @@
 // structure seems to get better precision for things like chopping cubics on exact cusp points than
 // "a*(1 - t) + b*t" (which would return exactly b when t == 1).
 template <int N>
-RIVE_ALWAYS_INLINE gvec<float, N> mix(gvec<float, N> a, gvec<float, N> b, gvec<float, N> t)
+SIMD_ALWAYS_INLINE gvec<float, N> mix(gvec<float, N> a, gvec<float, N> b, gvec<float, N> t)
 {
     assert(simd::all(0.f <= t && t < 1.f));
     return (b - a) * t + a;
@@ -453,7 +573,7 @@
 
 // Linearly interpolates between a and b, returning precisely 'a' if t==0 and precisely 'b' if t==1.
 template <int N>
-RIVE_ALWAYS_INLINE gvec<float, N> precise_mix(gvec<float, N> a, gvec<float, N> b, gvec<float, N> t)
+SIMD_ALWAYS_INLINE gvec<float, N> precise_mix(gvec<float, N> a, gvec<float, N> b, gvec<float, N> t)
 {
     return a * (1.f - t) + b * t;
 }
@@ -473,6 +593,32 @@
 template <int N> using uvec = simd::gvec<uint32_t, N>;
 using uint2 = uvec<2>;
 using uint4 = uvec<4>;
+
+using int8x8 = simd::gvec<int8_t, 8>;
+using int8x16 = simd::gvec<int8_t, 16>;
+using int8x32 = simd::gvec<int8_t, 32>;
+
+using uint8x8 = simd::gvec<uint8_t, 8>;
+using uint8x16 = simd::gvec<uint8_t, 16>;
+using uint8x32 = simd::gvec<uint8_t, 32>;
+
+using int16x4 = simd::gvec<int16_t, 4>;
+using int16x8 = simd::gvec<int16_t, 8>;
+using int16x16 = simd::gvec<int16_t, 16>;
+
+using uint16x4 = simd::gvec<uint16_t, 4>;
+using uint16x8 = simd::gvec<uint16_t, 8>;
+using uint16x16 = simd::gvec<uint16_t, 16>;
+
+using int64x2 = simd::gvec<int64_t, 2>;
+using int64x4 = simd::gvec<int64_t, 4>;
+
+using uint64x2 = simd::gvec<uint64_t, 2>;
+using uint64x4 = simd::gvec<uint64_t, 4>;
+
 } // namespace rive
 
+#undef SIMD_INLINE_MEMCPY
+#undef SIMD_ALWAYS_INLINE
+
 #endif
diff --git a/include/rive/math/simd_gvec_polyfill.hpp b/include/rive/math/simd_gvec_polyfill.hpp
index 50af70a..358156b 100644
--- a/include/rive/math/simd_gvec_polyfill.hpp
+++ b/include/rive/math/simd_gvec_polyfill.hpp
@@ -17,6 +17,7 @@
 #include <algorithm>
 #include <initializer_list>
 #include <stdint.h>
+#include <cstring>
 
 namespace rive
 {
@@ -192,35 +193,38 @@
 #undef DECL_UNARY_OP
 
 #define DECL_ARITHMETIC_OP(_OP_)                                                                   \
-    template <typename T, int N, Swizzle Z0, Swizzle Z1>                                           \
-    gvec<T, N, Z0>& operator _OP_##=(gvec<T, N, Z0>& a, gvec<T, N, Z1> b)                          \
+    template <typename T, typename U, int N, Swizzle Z0, Swizzle Z1>                               \
+    gvec<T, N, Z0>& operator _OP_##=(gvec<T, N, Z0>& a, gvec<U, N, Z1> b)                          \
     {                                                                                              \
         for (int i = 0; i < N; ++i)                                                                \
             a[i] _OP_## = b[i];                                                                    \
         return a;                                                                                  \
     }                                                                                              \
-    template <typename T, int N, Swizzle Z> gvec<T, N, Z>& operator _OP_##=(gvec<T, N, Z>& a, T b) \
+    template <typename T, typename U, int N, Swizzle Z>                                            \
+    gvec<T, N, Z>& operator _OP_##=(gvec<T, N, Z>& a, U b)                                         \
     {                                                                                              \
         for (int i = 0; i < N; ++i)                                                                \
             a[i] _OP_## = b;                                                                       \
         return a;                                                                                  \
     }                                                                                              \
-    template <typename T, int N, Swizzle Z0, Swizzle Z1>                                           \
-    gvec<T, N> operator _OP_(gvec<T, N, Z0> a, gvec<T, N, Z1> b)                                   \
+    template <typename T, typename U, int N, Swizzle Z0, Swizzle Z1>                               \
+    gvec<T, N> operator _OP_(gvec<T, N, Z0> a, gvec<U, N, Z1> b)                                   \
     {                                                                                              \
         gvec<T, N> ret;                                                                            \
         for (int i = 0; i < N; ++i)                                                                \
             ret[i] = a[i] _OP_ b[i];                                                               \
         return ret;                                                                                \
     }                                                                                              \
-    template <typename T, int N, Swizzle Z> gvec<T, N> operator _OP_(gvec<T, N, Z> a, T b)         \
+    template <typename T, typename U, int N, Swizzle Z>                                            \
+    gvec<T, N> operator _OP_(gvec<T, N, Z> a, U b)                                                 \
     {                                                                                              \
         gvec<T, N> ret;                                                                            \
         for (int i = 0; i < N; ++i)                                                                \
             ret[i] = a[i] _OP_ b;                                                                  \
         return ret;                                                                                \
     }                                                                                              \
-    template <typename T, int N, Swizzle Z> gvec<T, N> operator _OP_(T a, gvec<T, N, Z> b)         \
+    template <typename T, typename U, int N, Swizzle Z>                                            \
+    gvec<T, N> operator _OP_(T a, gvec<U, N, Z> b)                                                 \
     {                                                                                              \
         gvec<T, N> ret;                                                                            \
         for (int i = 0; i < N; ++i)                                                                \
@@ -232,6 +236,7 @@
 DECL_ARITHMETIC_OP(-);
 DECL_ARITHMETIC_OP(*);
 DECL_ARITHMETIC_OP(/);
+DECL_ARITHMETIC_OP(%);
 DECL_ARITHMETIC_OP(|);
 DECL_ARITHMETIC_OP(&);
 DECL_ARITHMETIC_OP(^);
@@ -324,7 +329,11 @@
     }
 
 ENABLE_SWIZZLE1(abs)
-ENABLE_SWIZZLE_REDUCE(sum)
+ENABLE_SWIZZLE_REDUCE(reduce_add)
+ENABLE_SWIZZLE_REDUCE(reduce_min)
+ENABLE_SWIZZLE_REDUCE(reduce_max)
+ENABLE_SWIZZLE_REDUCE(reduce_and)
+ENABLE_SWIZZLE_REDUCE(reduce_or)
 ENABLE_SWIZZLE1F(floor)
 ENABLE_SWIZZLE1F(ceil)
 ENABLE_SWIZZLE1F(sqrt)
diff --git a/include/rive/shapes/paint/blend_mode.hpp b/include/rive/shapes/paint/blend_mode.hpp
index 0fbc4b3..3b27246 100644
--- a/include/rive/shapes/paint/blend_mode.hpp
+++ b/include/rive/shapes/paint/blend_mode.hpp
@@ -2,7 +2,7 @@
 #define _RIVE_BLEND_MODE_HPP_
 namespace rive
 {
-enum class BlendMode : unsigned int
+enum class BlendMode : unsigned char
 {
     srcOver = 3,
     screen = 14,
@@ -22,4 +22,4 @@
     luminosity = 28
 };
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/src/math/aabb.cpp b/src/math/aabb.cpp
index 65cb3e8..23688fc 100644
--- a/src/math/aabb.cpp
+++ b/src/math/aabb.cpp
@@ -1,4 +1,6 @@
+#include "rive/math/math_types.hpp"
 #include "rive/math/aabb.hpp"
+#include "rive/math/simd.hpp"
 #include <algorithm>
 #include <cmath>
 
@@ -41,6 +43,14 @@
     };
 }
 
+IAABB AABB::roundOut() const
+{
+    float4 bounds = simd::load4f(this);
+    bounds.xy = simd::floor(bounds.xy);
+    bounds.zw = simd::ceil(bounds.zw);
+    return math::bit_cast<IAABB>(simd::cast<int32_t>(bounds));
+}
+
 void AABB::expandTo(AABB& out, const Vec2D& point) { expandTo(out, point.x, point.y); }
 
 void AABB::expandTo(AABB& out, float x, float y)
@@ -69,4 +79,4 @@
     out.minY = std::min(a.minY, b.minY);
     out.maxX = std::max(a.maxX, b.maxX);
     out.maxY = std::max(a.maxY, b.maxY);
-}
\ No newline at end of file
+}
diff --git a/src/math/hit_test.cpp b/src/math/hit_test.cpp
index 77a4c50..56fe574 100644
--- a/src/math/hit_test.cpp
+++ b/src/math/hit_test.cpp
@@ -4,6 +4,7 @@
 
 #include "rive/math/hit_test.hpp"
 
+#include "rive/math/mat2d.hpp"
 #include <algorithm>
 #include <assert.h>
 #include <cmath>
diff --git a/src/math/mat2d.cpp b/src/math/mat2d.cpp
index 194dd02..223e1b6 100644
--- a/src/math/mat2d.cpp
+++ b/src/math/mat2d.cpp
@@ -1,3 +1,4 @@
+#include "rive/math/math_types.hpp"
 #include "rive/math/mat2d.hpp"
 #include "rive/math/simd.hpp"
 #include "rive/math/transform_components.hpp"
@@ -87,6 +88,75 @@
     }
 }
 
+AABB Mat2D::mapBoundingBox(const Vec2D pts[], size_t n) const
+{
+    if (n == 0)
+    {
+        return {0, 0, 0, 0};
+    }
+
+    size_t i = 0;
+    float4 scale = float2{m_buffer[0], m_buffer[3]}.xyxy;
+    float4 skew = simd::load2f(&m_buffer[1]).yxyx;
+    float4 mins = std::numeric_limits<float>::infinity();
+    float4 maxes = -std::numeric_limits<float>::infinity();
+    if (simd::all(skew.xy == 0.f))
+    {
+        // Scale + translate matrix.
+        if (n & 1)
+        {
+            float2 p = simd::load2f(pts);
+            p = scale.xy * p;
+            mins.xy = maxes.xy = p;
+            ++i;
+        }
+        for (; i < n; i += 2)
+        {
+            float4 p = simd::load4f(pts + i);
+            p = scale * p;
+            mins = simd::min(p, mins);
+            maxes = simd::max(p, maxes);
+        }
+    }
+    else
+    {
+        // Affine matrix.
+        if (n & 1)
+        {
+            float2 p = simd::load2f(pts);
+            float2 p_ = skew.xy * p.yx;
+            p_ = scale.xy * p + p_;
+            mins.xy = maxes.xy = p_;
+            ++i;
+        }
+        for (; i < n; i += 2)
+        {
+            float4 p = simd::load4f(pts + i);
+            float4 p_ = skew * p.yxwz;
+            p_ = scale * p + p_;
+            mins = simd::min(p_, mins);
+            maxes = simd::max(p_, maxes);
+        }
+    }
+
+    float4 bbox = simd::join(simd::min(mins.xy, mins.zw), simd::max(maxes.xy, maxes.zw));
+    assert(simd::all(bbox.xy <= bbox.zw));
+
+    float4 trans = simd::load2f(&m_buffer[4]).xyxy;
+    bbox += trans;
+
+    return math::bit_cast<AABB>(bbox);
+}
+
+AABB Mat2D::mapBoundingBox(const AABB& aabb) const
+{
+    Vec2D pts[4] = {{aabb.left(), aabb.top()},
+                    {aabb.right(), aabb.top()},
+                    {aabb.right(), aabb.bottom()},
+                    {aabb.left(), aabb.bottom()}};
+    return mapBoundingBox(pts, 4);
+}
+
 bool Mat2D::invert(Mat2D* result) const
 {
     float aa = m_buffer[0], ab = m_buffer[1], ac = m_buffer[2], ad = m_buffer[3], atx = m_buffer[4],
diff --git a/src/shapes/paint/color.cpp b/src/shapes/paint/color.cpp
index 1c4cb1c..6b6e1e7 100644
--- a/src/shapes/paint/color.cpp
+++ b/src/shapes/paint/color.cpp
@@ -1,6 +1,7 @@
 #include "rive/shapes/paint/color.hpp"
 
 #include "rive/math/simd.hpp"
+#include <algorithm>
 #include <stdio.h>
 
 namespace rive
diff --git a/test/mat2d_test.cpp b/test/mat2d_test.cpp
index c21df78..aa904ea 100644
--- a/test/mat2d_test.cpp
+++ b/test/mat2d_test.cpp
@@ -165,4 +165,73 @@
         // REQUIRE(minScale / min >= gCloseScaleTol);
     }
 }
+
+TEST_CASE("mapBoundingBox", "[Mat2D]")
+{
+    const std::vector<Vec2D> testPts{{0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, -1}};
+    std::vector<Vec2D> mappedPts(testPts.size());
+    auto checkMatrix = [&](Mat2D m) {
+        // Check zero points.
+        AABB mappedBbox = m.mapBoundingBox(nullptr, 0);
+        CHECK(mappedBbox.left() == 0);
+        CHECK(mappedBbox.top() == 0);
+        CHECK(mappedBbox.right() == 0);
+        CHECK(mappedBbox.bottom() == 0);
+
+        // Find a single point boinding box.
+        for (const Vec2D pt : testPts)
+        {
+            mappedBbox = m.mapBoundingBox(&pt, 1);
+            Vec2D mappedPt;
+            m.mapPoints(&mappedPt, &pt, 1);
+            CHECK(mappedBbox.left() == Approx(mappedPt.x));
+            CHECK(mappedBbox.top() == Approx(mappedPt.y));
+            CHECK(mappedBbox.right() == Approx(mappedPt.x));
+            CHECK(mappedBbox.bottom() == Approx(mappedPt.y));
+        }
+
+        // Check n - 1 points (ensures we test one even-length and one odd-length array).
+        m.mapPoints(mappedPts.data(), testPts.data() + 1, testPts.size() - 1);
+        mappedBbox = m.mapBoundingBox(testPts.data() + 1, testPts.size() - 1);
+        AABB testBbox = {1e9f, 1e9f, -1e9f, -1e9f};
+        for (size_t i = 0; i < testPts.size() - 1; ++i)
+        {
+            AABB::expandTo(testBbox, mappedPts[i]);
+        }
+        CHECK(mappedBbox.left() == Approx(testBbox.left()));
+        CHECK(mappedBbox.top() == Approx(testBbox.top()));
+        CHECK(mappedBbox.right() == Approx(testBbox.right()));
+        CHECK(mappedBbox.bottom() == Approx(testBbox.bottom()));
+
+        // Check n points.
+        m.mapPoints(mappedPts.data(), testPts.data(), testPts.size());
+        mappedBbox = m.mapBoundingBox(testPts.data(), testPts.size());
+        testBbox = {1e9f, 1e9f, -1e9f, -1e9f};
+        for (size_t i = 0; i < testPts.size(); ++i)
+        {
+            AABB::expandTo(testBbox, mappedPts[i]);
+        }
+        CHECK(mappedBbox.left() == Approx(testBbox.left()));
+        CHECK(mappedBbox.top() == Approx(testBbox.top()));
+        CHECK(mappedBbox.right() == Approx(testBbox.right()));
+        CHECK(mappedBbox.bottom() == Approx(testBbox.bottom()));
+
+        // Check mapping of a bounding box.
+        Vec2D bboxPts[4] = {{0, 0}, {0, 1}, {1, 1}, {1, 0}};
+        AABB mappedBboxFromPts = m.mapBoundingBox(bboxPts, 4);
+        AABB mappedBboxFromAABB = m.mapBoundingBox(AABB{0, 0, 1, 1});
+        CHECK(mappedBboxFromPts.left() == Approx(mappedBboxFromAABB.left()));
+        CHECK(mappedBboxFromPts.top() == Approx(mappedBboxFromAABB.top()));
+        CHECK(mappedBboxFromPts.right() == Approx(mappedBboxFromAABB.right()));
+        CHECK(mappedBboxFromPts.bottom() == Approx(mappedBboxFromAABB.bottom()));
+    };
+    checkMatrix(Mat2D());
+    checkMatrix(Mat2D(1, 0, 0, 1, 2, -3));
+    checkMatrix(Mat2D(4, 0, 0, -5, 0, 0));
+    checkMatrix(Mat2D(4, 0, 0, 5, -6, 7));
+    checkMatrix(Mat2D(0, 8, 9, 0, 10, 11));
+    checkMatrix(Mat2D(-12, -13, -14, -15, -16, -17));
+    checkMatrix(Mat2D(18, 19, 20, 21, 22, 23));
+    checkMatrix(Mat2D(-25, 26, 27, -28, 29, -30));
+}
 } // namespace rive
diff --git a/test/simd_test.cpp b/test/simd_test.cpp
index 57d334b..cf8d471 100644
--- a/test/simd_test.cpp
+++ b/test/simd_test.cpp
@@ -381,34 +381,84 @@
                   int2{std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::min()}));
 }
 
-// Check simd::sum.
-TEST_CASE("sum", "[simd]")
+// Check simd::reduce* methods.
+TEST_CASE("reduce", "[simd]")
 {
     {
         float4 v = {1, 2, 3, 4};
-        CHECK(simd::sum(v) == 10);
-        CHECK(simd::sum(v.zwxy) == 10);
-        CHECK(simd::sum(v.xyz) == 6);
-        CHECK(simd::sum(v.yz) == 5);
-        CHECK(simd::sum(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_add(v) == 10);
+        CHECK(simd::reduce_add(v.zwxy) == 10);
+        CHECK(simd::reduce_add(v.xyz) == 6);
+        CHECK(simd::reduce_add(v.yz) == 5);
+        CHECK(simd::reduce_add(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_min(v) == 1);
+        CHECK(simd::reduce_min(v.zwxy) == 1);
+        CHECK(simd::reduce_min(v.xyz) == 1);
+        CHECK(simd::reduce_min(v.yz) == 2);
+        CHECK(simd::reduce_min(v.xy.yxyx) == 1);
+        CHECK(simd::reduce_max(v) == 4);
+        CHECK(simd::reduce_max(v.zwxy) == 4);
+        CHECK(simd::reduce_max(v.xyz) == 3);
+        CHECK(simd::reduce_max(v.yz) == 3);
+        CHECK(simd::reduce_max(v.xy.yxyx) == 2);
     }
 
     {
         int4 v = {1, 2, 3, 4};
-        CHECK(simd::sum(v) == 10);
-        CHECK(simd::sum(v.zwxy) == 10);
-        CHECK(simd::sum(v.xyz) == 6);
-        CHECK(simd::sum(v.yz) == 5);
-        CHECK(simd::sum(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_add(v) == 10);
+        CHECK(simd::reduce_add(v.zwxy) == 10);
+        CHECK(simd::reduce_add(v.xyz) == 6);
+        CHECK(simd::reduce_add(v.yz) == 5);
+        CHECK(simd::reduce_add(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_min(v) == 1);
+        CHECK(simd::reduce_min(v.zwxy) == 1);
+        CHECK(simd::reduce_min(v.xyz) == 1);
+        CHECK(simd::reduce_min(v.yz) == 2);
+        CHECK(simd::reduce_min(v.xy.yxyx) == 1);
+        CHECK(simd::reduce_max(v) == 4);
+        CHECK(simd::reduce_max(v.zwxy) == 4);
+        CHECK(simd::reduce_max(v.xyz) == 3);
+        CHECK(simd::reduce_max(v.yz) == 3);
+        CHECK(simd::reduce_max(v.xy.yxyx) == 2);
+        CHECK(simd::reduce_and(v) == 0);
+        CHECK(simd::reduce_and(v.zwxy) == 0);
+        CHECK(simd::reduce_and(v.xyz) == 0);
+        CHECK(simd::reduce_and(v.yz) == 2);
+        CHECK(simd::reduce_and(v.xy.yxyx) == 0);
+        CHECK(simd::reduce_or(v) == 7);
+        CHECK(simd::reduce_or(v.zwxy) == 7);
+        CHECK(simd::reduce_or(v.xyz) == 3);
+        CHECK(simd::reduce_or(v.yz) == 3);
+        CHECK(simd::reduce_or(v.xy.yxyx) == 3);
     }
 
     {
         uint4 v = {1, 2, 3, 4};
-        CHECK(simd::sum(v) == 10);
-        CHECK(simd::sum(v.zwxy) == 10);
-        CHECK(simd::sum(v.xyz) == 6);
-        CHECK(simd::sum(v.yz) == 5);
-        CHECK(simd::sum(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_add(v) == 10);
+        CHECK(simd::reduce_add(v.zwxy) == 10);
+        CHECK(simd::reduce_add(v.xyz) == 6);
+        CHECK(simd::reduce_add(v.yz) == 5);
+        CHECK(simd::reduce_add(v.xy.yxyx) == 6);
+        CHECK(simd::reduce_min(v) == 1);
+        CHECK(simd::reduce_min(v.zwxy) == 1);
+        CHECK(simd::reduce_min(v.xyz) == 1);
+        CHECK(simd::reduce_min(v.yz) == 2);
+        CHECK(simd::reduce_min(v.xy.yxyx) == 1);
+        CHECK(simd::reduce_max(v) == 4);
+        CHECK(simd::reduce_max(v.zwxy) == 4);
+        CHECK(simd::reduce_max(v.xyz) == 3);
+        CHECK(simd::reduce_max(v.yz) == 3);
+        CHECK(simd::reduce_max(v.xy.yxyx) == 2);
+        CHECK(simd::reduce_and(v) == 0);
+        CHECK(simd::reduce_and(v.zwxy) == 0);
+        CHECK(simd::reduce_and(v.xyz) == 0);
+        CHECK(simd::reduce_and(v.yz) == 2);
+        CHECK(simd::reduce_and(v.xy.yxyx) == 0);
+        CHECK(simd::reduce_or(v) == 7);
+        CHECK(simd::reduce_or(v.zwxy) == 7);
+        CHECK(simd::reduce_or(v.xyz) == 3);
+        CHECK(simd::reduce_or(v.yz) == 3);
+        CHECK(simd::reduce_or(v.xy.yxyx) == 3);
     }
 }
 
@@ -595,6 +645,24 @@
 {
     CHECK_ALL((simd::join(int2{1, 2}, int4{3, 4, 5, 6}) == ivec<6>{1, 2, 3, 4, 5, 6}));
     CHECK_ALL((simd::join(vec<1>{1}, vec<3>{2, 3, 4}) == float4{1, 2, 3, 4}));
+    CHECK_ALL((simd::join(vec<1>{1}, vec<2>{2, 3}, vec<3>{4, 5, 6}) == vec<6>{1, 2, 3, 4, 5, 6}));
+    CHECK_ALL((simd::join(vec<1>{1}, vec<2>{2, 3}, vec<3>{4, 5, 6}, float4{7, 8, 9, 10}) ==
+               vec<10>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}));
+    uint8x8 a = 3, b = 9, c = 3, d = 100;
+    CHECK_ALL((simd::join(a, b, c, d) == uint8x32{3, 3, 3,   3,   3,   3,   3,   3,   9,   9,  9,
+                                                  9, 9, 9,   9,   9,   3,   3,   3,   3,   3,  3,
+                                                  3, 3, 100, 100, 100, 100, 100, 100, 100, 100}));
+}
+
+// Check simd::zip
+TEST_CASE("zip", "[simd]")
+{
+    CHECK_ALL((simd::zip(int4{1, 2, 3, 4}, int4{5, 6, 7, 8}) == ivec<8>{1, 5, 2, 6, 3, 7, 4, 8}));
+    CHECK_ALL((simd::zip(simd::gvec<uint8_t, 8>{1, 2, 3, 4, 5, 6, 7, 8},
+                         simd::gvec<uint8_t, 8>{9, 10, 11, 12, 13, 14, 15, 16}) ==
+               simd::gvec<uint8_t, 16>{1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16}));
+    CHECK_ALL(
+        (simd::zip(float4{1, 2, 3, 4}, float4{5, 6, 7, 8}) == vec<8>{1, 5, 2, 6, 3, 7, 4, 8}));
 }
 
 template <int N> static vec<N> mix_reference_impl(vec<N> a, vec<N> b, float t)