feat(scripting): first-class Mat4 type with SIMD multiply (#12445) a076a8abde

Co-authored-by: Luigi Rosso <luigi-rosso@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head
index 1fa9a0b..1bf2a3e 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-66b9d31e8ff505fe8c9800e865e933a452abc5b4
+a076a8abde3ac7c5af06e53cc55f0fedfb6b4bf4
diff --git a/include/rive/lua/rive_lua_libs.hpp b/include/rive/lua/rive_lua_libs.hpp
index a79dc66..3527173 100644
--- a/include/rive/lua/rive_lua_libs.hpp
+++ b/include/rive/lua/rive_lua_libs.hpp
@@ -10,6 +10,7 @@
 #include "rive/math/raw_path.hpp"
 #include "rive/renderer.hpp"
 #include "rive/math/vec2d.hpp"
+#include "rive/math/mat4.hpp"
 #include "rive/math/contour_measure.hpp"
 #include "rive/math/path_measure.hpp"
 #include "rive/shapes/paint/image_sampler.hpp"
@@ -332,6 +333,12 @@
 
     // Image decode
     decodeImage,
+
+    // Mat4
+    transpose,
+    transformPoint,
+    transformVec4,
+    writeToBuffer,
 };
 
 struct ScriptedMat2D
@@ -354,6 +361,21 @@
 static_assert(std::is_trivially_destructible<ScriptedMat2D>::value,
               "ScriptedMat2D must be trivially destructible");
 
+struct ScriptedMat4 // Lua userdata payload wrapping a single rive::Mat4.
+{
+    rive::Mat4 value; // Mat4's default constructor produces the identity.
+
+    ScriptedMat4() : value() {}
+    ScriptedMat4(const Mat4& mat) : value(mat) {}
+
+    static constexpr uint8_t luaTag = LUA_T_COUNT + 62;
+    static constexpr const char* luaName = "Mat4";
+    static constexpr bool hasMetatable = true;
+};
+
+static_assert(std::is_trivially_destructible<ScriptedMat4>::value,
+              "ScriptedMat4 must be trivially destructible");
+
 class ScriptedPathCommand
 {
 public:
diff --git a/include/rive/math/mat4.hpp b/include/rive/math/mat4.hpp
new file mode 100644
index 0000000..3b0351d
--- /dev/null
+++ b/include/rive/math/mat4.hpp
@@ -0,0 +1,288 @@
+#ifndef _RIVE_MAT4_HPP_
+#define _RIVE_MAT4_HPP_
+
+#include "rive/math/simd.hpp"
+#include "rive/math/vec2d.hpp"
+#include <array>
+#include <cmath>
+#include <cstddef>
+
+namespace rive
+{
+// Column-major 4x4 float matrix stored as 16 contiguous floats (64 bytes, no
+// padding), so values() can be copied verbatim into a GPU uniform buffer.
+// Element (row r, column c), both 0-based, lives at linear index c * 4 + r:
+// column 0 = m[0..3], column 1 = m[4..7], column 2 = m[8..11],
+// column 3 = m[12..15]. Translation therefore sits in m[12..14].
+class Mat4
+{
+public:
+    constexpr Mat4() : // Default-constructs the identity matrix.
+        m_buffer{{1.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  1.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  1.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  0.f,
+                  1.f}}
+    {}
+
+    constexpr Mat4(float c0x, // cN = column N; x, y, z, w = rows 0..3.
+                   float c0y,
+                   float c0z,
+                   float c0w,
+                   float c1x,
+                   float c1y,
+                   float c1z,
+                   float c1w,
+                   float c2x,
+                   float c2y,
+                   float c2z,
+                   float c2w,
+                   float c3x,
+                   float c3y,
+                   float c3z,
+                   float c3w) :
+        m_buffer{{c0x,
+                  c0y,
+                  c0z,
+                  c0w,
+                  c1x,
+                  c1y,
+                  c1z,
+                  c1w,
+                  c2x,
+                  c2y,
+                  c2z,
+                  c2w,
+                  c3x,
+                  c3y,
+                  c3z,
+                  c3w}}
+    {}
+
+    const float* values() const { return m_buffer.data(); } // 16 floats
+    float* values() { return m_buffer.data(); }
+
+    float& operator[](size_t i) { return m_buffer[i]; } // linear index 0..15,
+    float operator[](size_t i) const { return m_buffer[i]; } // not range-checked
+
+    static Mat4 identity() { return Mat4(); }
+
+    static Mat4 fromTranslation(float x, float y, float z)
+    {
+        Mat4 m; // identity; only column 3 (m[12..14]) is overwritten
+        m.m_buffer[12] = x;
+        m.m_buffer[13] = y;
+        m.m_buffer[14] = z;
+        return m;
+    }
+
+    static Mat4 fromScale(float sx, float sy, float sz)
+    {
+        Mat4 m; // identity with the diagonal replaced by (sx, sy, sz, 1)
+        m.m_buffer[0] = sx;
+        m.m_buffer[5] = sy;
+        m.m_buffer[10] = sz;
+        return m;
+    }
+
+    static Mat4 fromRotationX(float rad)
+    {
+        float c = std::cos(rad), s = std::sin(rad);
+        Mat4 m; // identity; the rotation fills rows/columns 1..2 (Y and Z)
+        m.m_buffer[5] = c;
+        m.m_buffer[6] = s;
+        m.m_buffer[9] = -s;
+        m.m_buffer[10] = c;
+        return m;
+    }
+
+    static Mat4 fromRotationY(float rad)
+    {
+        float c = std::cos(rad), s = std::sin(rad);
+        Mat4 m; // identity; the rotation fills rows/columns 0 and 2 (X and Z)
+        m.m_buffer[0] = c;
+        m.m_buffer[2] = -s;
+        m.m_buffer[8] = s;
+        m.m_buffer[10] = c;
+        return m;
+    }
+
+    static Mat4 fromRotationZ(float rad)
+    {
+        float c = std::cos(rad), s = std::sin(rad);
+        Mat4 m; // identity; the rotation fills rows/columns 0..1 (X and Y)
+        m.m_buffer[0] = c;
+        m.m_buffer[1] = s;
+        m.m_buffer[4] = -s;
+        m.m_buffer[5] = c;
+        return m;
+    }
+
+    // Right-handed perspective from a vertical FOV in radians. Maps view-space
+    // z=[-near, -far] to NDC z=[0, 1] (depthZeroToOne, default) or z=[-1, 1].
+    static Mat4 perspective(float fovYRadians,
+                            float aspect,
+                            float near_,
+                            float far_,
+                            bool depthZeroToOne = true)
+    {
+        float f = 1.f / std::tan(fovYRadians * 0.5f); // cot(fovY / 2)
+        float nf = 1.f / (near_ - far_); // negative when far_ > near_
+        Mat4 m{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // all zeros
+        m.m_buffer[0] = f / aspect;
+        m.m_buffer[5] = f;
+        if (depthZeroToOne)
+        {
+            m.m_buffer[10] = far_ * nf;
+            m.m_buffer[14] = far_ * near_ * nf;
+        }
+        else
+        {
+            m.m_buffer[10] = (far_ + near_) * nf;
+            m.m_buffer[14] = 2.f * far_ * near_ * nf;
+        }
+        m.m_buffer[11] = -1.f; // clip-space w = -z of the view-space input
+        return m;
+    }
+
+    // SIMD: returns lhs * rhs as a new matrix. Both operands are column-major.
+    static Mat4 multiply(const Mat4& lhs, const Mat4& rhs)
+    {
+        // Each output column j is a linear combination of lhs's columns
+        // weighted by the four entries of rhs's column j.
+        const float* L = lhs.m_buffer.data();
+        const float* R = rhs.m_buffer.data();
+        float4 c0 = simd::load4f(L);
+        float4 c1 = simd::load4f(L + 4);
+        float4 c2 = simd::load4f(L + 8);
+        float4 c3 = simd::load4f(L + 12);
+
+        Mat4 out; // starts as identity; all four columns are overwritten below
+        for (int j = 0; j < 4; ++j)
+        {
+            const float* rcol = R + j * 4;
+            float4 result =
+                c0 * rcol[0] + c1 * rcol[1] + c2 * rcol[2] + c3 * rcol[3];
+            simd::store(out.m_buffer.data() + j * 4, result);
+        }
+        return out;
+    }
+
+    Mat4 operator*(const Mat4& rhs) const { return multiply(*this, rhs); }
+
+    // SIMD: out = M * (x, y, z, w). Writes the four results (xyzw) into `out`.
+    void transformVec4(float out[4], float x, float y, float z, float w) const
+    {
+        float4 c0 = simd::load4f(m_buffer.data());
+        float4 c1 = simd::load4f(m_buffer.data() + 4);
+        float4 c2 = simd::load4f(m_buffer.data() + 8);
+        float4 c3 = simd::load4f(m_buffer.data() + 12);
+        simd::store(out, c0 * x + c1 * y + c2 * z + c3 * w);
+    }
+
+    Mat4 transposed() const // returns a transposed copy; *this is unchanged
+    {
+        Mat4 t;
+        for (int r = 0; r < 4; ++r)
+            for (int c = 0; c < 4; ++c)
+                t.m_buffer[r * 4 + c] = m_buffer[c * 4 + r];
+        return t;
+    }
+
+    // Cofactor-method inverse. Writes to *result and returns true when the
+    // determinant is nonzero; otherwise returns false, *result untouched.
+    bool invert(Mat4* result) const
+    {
+        const float* m = m_buffer.data();
+        float inv[16]; // unscaled inverse, kept local until det is known
+
+        inv[0] = m[5] * m[10] * m[15] - m[5] * m[11] * m[14] -
+                 m[9] * m[6] * m[15] + m[9] * m[7] * m[14] +
+                 m[13] * m[6] * m[11] - m[13] * m[7] * m[10];
+        inv[4] = -m[4] * m[10] * m[15] + m[4] * m[11] * m[14] +
+                 m[8] * m[6] * m[15] - m[8] * m[7] * m[14] -
+                 m[12] * m[6] * m[11] + m[12] * m[7] * m[10];
+        inv[8] = m[4] * m[9] * m[15] - m[4] * m[11] * m[13] -
+                 m[8] * m[5] * m[15] + m[8] * m[7] * m[13] +
+                 m[12] * m[5] * m[11] - m[12] * m[7] * m[9];
+        inv[12] = -m[4] * m[9] * m[14] + m[4] * m[10] * m[13] +
+                  m[8] * m[5] * m[14] - m[8] * m[6] * m[13] -
+                  m[12] * m[5] * m[10] + m[12] * m[6] * m[9];
+        inv[1] = -m[1] * m[10] * m[15] + m[1] * m[11] * m[14] +
+                 m[9] * m[2] * m[15] - m[9] * m[3] * m[14] -
+                 m[13] * m[2] * m[11] + m[13] * m[3] * m[10];
+        inv[5] = m[0] * m[10] * m[15] - m[0] * m[11] * m[14] -
+                 m[8] * m[2] * m[15] + m[8] * m[3] * m[14] +
+                 m[12] * m[2] * m[11] - m[12] * m[3] * m[10];
+        inv[9] = -m[0] * m[9] * m[15] + m[0] * m[11] * m[13] +
+                 m[8] * m[1] * m[15] - m[8] * m[3] * m[13] -
+                 m[12] * m[1] * m[11] + m[12] * m[3] * m[9];
+        inv[13] = m[0] * m[9] * m[14] - m[0] * m[10] * m[13] -
+                  m[8] * m[1] * m[14] + m[8] * m[2] * m[13] +
+                  m[12] * m[1] * m[10] - m[12] * m[2] * m[9];
+        inv[2] = m[1] * m[6] * m[15] - m[1] * m[7] * m[14] -
+                 m[5] * m[2] * m[15] + m[5] * m[3] * m[14] +
+                 m[13] * m[2] * m[7] - m[13] * m[3] * m[6];
+        inv[6] = -m[0] * m[6] * m[15] + m[0] * m[7] * m[14] +
+                 m[4] * m[2] * m[15] - m[4] * m[3] * m[14] -
+                 m[12] * m[2] * m[7] + m[12] * m[3] * m[6];
+        inv[10] = m[0] * m[5] * m[15] - m[0] * m[7] * m[13] -
+                  m[4] * m[1] * m[15] + m[4] * m[3] * m[13] +
+                  m[12] * m[1] * m[7] - m[12] * m[3] * m[5];
+        inv[14] = -m[0] * m[5] * m[14] + m[0] * m[6] * m[13] +
+                  m[4] * m[1] * m[14] - m[4] * m[2] * m[13] -
+                  m[12] * m[1] * m[6] + m[12] * m[2] * m[5];
+        inv[3] = -m[1] * m[6] * m[11] + m[1] * m[7] * m[10] +
+                 m[5] * m[2] * m[11] - m[5] * m[3] * m[10] -
+                 m[9] * m[2] * m[7] + m[9] * m[3] * m[6];
+        inv[7] = m[0] * m[6] * m[11] - m[0] * m[7] * m[10] -
+                 m[4] * m[2] * m[11] + m[4] * m[3] * m[10] +
+                 m[8] * m[2] * m[7] - m[8] * m[3] * m[6];
+        inv[11] = -m[0] * m[5] * m[11] + m[0] * m[7] * m[9] +
+                  m[4] * m[1] * m[11] - m[4] * m[3] * m[9] -
+                  m[8] * m[1] * m[7] + m[8] * m[3] * m[5];
+        inv[15] = m[0] * m[5] * m[10] - m[0] * m[6] * m[9] -
+                  m[4] * m[1] * m[10] + m[4] * m[2] * m[9] +
+                  m[8] * m[1] * m[6] - m[8] * m[2] * m[5];
+
+        float det =
+            m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];
+        if (det == 0.f) // exact test: only a true zero determinant is rejected
+            return false;
+        float invDet = 1.f / det;
+        for (int i = 0; i < 16; ++i)
+            (*result)[i] = inv[i] * invDet; // safe even if result == this
+        return true;
+    }
+
+private:
+    std::array<float, 16> m_buffer; // column-major, index = column * 4 + row
+};
+
+static_assert(std::is_trivially_destructible<Mat4>::value,
+              "Mat4 must be trivially destructible");
+static_assert(sizeof(Mat4) == 16 * sizeof(float),
+              "Mat4 must be 64 bytes (no padding)");
+
+inline bool operator==(const Mat4& a, const Mat4& b)
+{
+    // Exact per-element float compare: any NaN entry makes the result false.
+    const float *lhs = a.values(), *rhs = b.values(), *end = lhs + 16;
+    while (lhs != end && !(*lhs != *rhs)) { ++lhs; ++rhs; }
+    return lhs == end;
+}
+inline bool operator!=(const Mat4& a, const Mat4& b) { return !(a == b); }
+
+} // namespace rive
+#endif
diff --git a/src/lua/math/lua_mat4.cpp b/src/lua/math/lua_mat4.cpp
new file mode 100644
index 0000000..402b64a
--- /dev/null
+++ b/src/lua/math/lua_mat4.cpp
@@ -0,0 +1,351 @@
+#ifdef WITH_RIVE_SCRIPTING
+#include "rive/math/mat4.hpp"
+#include "rive/lua/rive_lua_libs.hpp"
+#include <cstdlib>
+#include <cstring>
+
+using namespace rive;
+
+static ScriptedMat4* lua_pushmat4(lua_State* L, const Mat4& mat)
+{
+    return lua_newrive<ScriptedMat4>(L, mat); // assumed: pushes a copy of mat
+}
+
+static ScriptedMat4* lua_pushmat4(lua_State* L)
+{
+    return lua_newrive<ScriptedMat4>(L); // presumably default-constructed
+}
+
+// Mat4.values(c0x, c0y, c0z, c0w, c1x, ..., c3w) — column-major.
+// All 16 arguments are validated before the result is pushed; otherwise a
+// short argument list would make luaL_checknumber inspect the new userdata.
+static int mat4_values(lua_State* L)
+{
+    float m[16];
+    for (int i = 0; i < 16; ++i)
+        m[i] = float(luaL_checknumber(L, 1 + i));
+    std::memcpy(lua_pushmat4(L)->value.values(), m, sizeof(m));
+    return 1;
+}
+
+static int mat4_identity(lua_State* L) // Lua: Mat4.identity() -> Mat4
+{
+    lua_pushmat4(L, Mat4::identity());
+    return 1; // one result: the new Mat4
+}
+
+static int mat4_fromTranslation(lua_State* L) // Lua: (x, y, z) -> Mat4
+{
+    float t[3]; // all three components are required numbers
+    for (int axis = 0; axis < 3; ++axis)
+        t[axis] = float(luaL_checknumber(L, axis + 1));
+    lua_pushmat4(L, Mat4::fromTranslation(t[0], t[1], t[2]));
+    return 1;
+}
+
+static int mat4_fromScale(lua_State* L) // Lua: (sx [, sy [, sz]]) -> Mat4
+{
+    float sx = float(luaL_checknumber(L, 1));
+    float sy = float(luaL_optnumber(L, 2, sx)); // absent/nil -> sx; a value
+    float sz = float(luaL_optnumber(L, 3, sx)); // of the wrong type raises
+    lua_pushmat4(L, Mat4::fromScale(sx, sy, sz));
+    return 1;
+}
+
+static int mat4_fromRotationX(lua_State* L) // Lua: Mat4.fromRotationX(rad)
+{
+    lua_pushmat4(L, Mat4::fromRotationX(float(luaL_checknumber(L, 1))));
+    return 1; // one result: the new Mat4
+}
+
+static int mat4_fromRotationY(lua_State* L) // Lua: Mat4.fromRotationY(rad)
+{
+    lua_pushmat4(L, Mat4::fromRotationY(float(luaL_checknumber(L, 1))));
+    return 1; // one result: the new Mat4
+}
+
+static int mat4_fromRotationZ(lua_State* L) // Lua: Mat4.fromRotationZ(rad)
+{
+    lua_pushmat4(L, Mat4::fromRotationZ(float(luaL_checknumber(L, 1))));
+    return 1; // one result: the new Mat4
+}
+
+static int mat4_perspective(lua_State* L) // Lua: (fovY, aspect, near, far)
+{
+    float p[4]; // fovY in radians, aspect, near, far — all required
+    for (int i = 0; i < 4; ++i)
+        p[i] = float(luaL_checknumber(L, i + 1));
+    // Scripts always get the [0, 1] depth range; [-1, 1] is not exposed here.
+    lua_pushmat4(L, Mat4::perspective(p[0], p[1], p[2], p[3], true));
+    return 1;
+}
+
+// Lua: Mat4.multiply(out, a, b) stores a * b into the existing `out` userdata
+// and returns that same `out`, so tight loops need not allocate a new Mat4.
+static int mat4_static_multiply(lua_State* L)
+{
+    auto dst = lua_torive<ScriptedMat4>(L, 1);
+    const Mat4& lhs = lua_torive<ScriptedMat4>(L, 2)->value;
+    const Mat4& rhs = lua_torive<ScriptedMat4>(L, 3)->value;
+    dst->value = lhs * rhs; // product is a temporary, so `out` may alias a/b
+    lua_pushvalue(L, 1);    // hand `out` back to the caller
+    return 1;
+}
+
+static int mat4_static_invert(lua_State* L) // Lua: Mat4.invert(out, m) -> bool
+{
+    Mat4* dst = &lua_torive<ScriptedMat4>(L, 1)->value;
+    const Mat4& src = lua_torive<ScriptedMat4>(L, 2)->value;
+    lua_pushboolean(L, src.invert(dst) ? 1 : 0); // false: `out` is untouched
+    return 1;
+}
+
+// Maps a Mat4 field name to its column-major offset (0..15); -1 if invalid.
+// Accepted spellings: "mRC" (row R, column C, each 1..4) and a decimal
+// linear index "1".."16" (1-indexed, column-major).
+// Digits are parsed by hand because strtol also accepts leading whitespace
+// and a sign, which would let keys such as " 1" or "+1" alias real fields.
+static int mat4_field_offset(const char* name, size_t namelen)
+{
+    if (namelen == 3 && name[0] == 'm')
+    {
+        int row = name[1] - '0';
+        int col = name[2] - '0';
+        if (row >= 1 && row <= 4 && col >= 1 && col <= 4)
+        {
+            // m[row][col] 1-indexed; column-major storage means
+            // index = (col-1)*4 + (row-1).
+            return (col - 1) * 4 + (row - 1);
+        }
+    }
+    if (namelen < 1 || namelen > 2)
+    {
+        return -1;
+    }
+    int n = 0;
+    for (size_t i = 0; i < namelen; ++i)
+    {
+        if (name[i] < '0' || name[i] > '9')
+            return -1;
+        n = n * 10 + (name[i] - '0');
+    }
+    return (n >= 1 && n <= 16) ? n - 1 : -1;
+}
+
+// Field read for __index: pushes the element and returns 1 when `name` is a
+// valid field, otherwise pushes nothing and returns 0.
+static int mat4_index_field(lua_State* L,
+                            ScriptedMat4* mat,
+                            const char* name,
+                            size_t namelen)
+{
+    int offset = mat4_field_offset(name, namelen);
+    if (offset < 0)
+        return 0;
+    lua_pushnumber(L, mat->value[offset]);
+    return 1;
+}
+
+// Field write for __newindex: stores `value` and returns 0 when `name` is a
+// valid field, otherwise leaves the matrix untouched and returns -1.
+static int mat4_newindex_field(lua_State* L,
+                               ScriptedMat4* mat,
+                               const char* name,
+                               size_t namelen,
+                               float value)
+{
+    int offset = mat4_field_offset(name, namelen);
+    if (offset < 0)
+        return -1;
+    mat->value[offset] = value;
+    return 0;
+}
+
+// __index metamethod: mat.mRC or mat[n]. Raises a Lua error for unknown keys.
+static int mat4_index(lua_State* L)
+{
+    auto self = lua_torive<ScriptedMat4>(L, 1);
+    size_t keyLength = 0;
+    // checklstring also stringifies numeric keys, which is how mat[1] works.
+    const char* key = luaL_checklstring(L, 2, &keyLength);
+    const bool found = mat4_index_field(L, self, key, keyLength) == 1;
+    if (!found)
+        luaL_error(L, "'%s' is not a valid index of %s", key,
+                   ScriptedMat4::luaName);
+    return found ? 1 : 0;
+}
+
+// __newindex metamethod: mat.mRC = v or mat[n] = v. Unknown keys raise.
+static int mat4_newindex(lua_State* L)
+{
+    auto self = lua_torive<ScriptedMat4>(L, 1);
+    size_t keyLength = 0;
+    const char* key = luaL_checklstring(L, 2, &keyLength);
+    const float element = float(luaL_checknumber(L, 3)); // double -> float
+    if (mat4_newindex_field(L, self, key, keyLength, element) != 0)
+    {
+        luaL_error(L, "'%s' is not a valid index of %s", key,
+                   ScriptedMat4::luaName);
+    }
+    return 0;
+}
+
+static int mat4_mul(lua_State* L) // __mul metamethod: a * b -> new Mat4
+{
+    const Mat4& lhs = lua_torive<ScriptedMat4>(L, 1)->value;
+    const Mat4& rhs = lua_torive<ScriptedMat4>(L, 2)->value;
+    lua_pushmat4(L, lhs * rhs); // operator* forwards to Mat4::multiply
+    return 1;
+}
+
+static int mat4_eq(lua_State* L) // __eq metamethod: exact element equality
+{
+    const Mat4& lhs = lua_torive<ScriptedMat4>(L, 1)->value;
+    const Mat4& rhs = lua_torive<ScriptedMat4>(L, 2)->value;
+    lua_pushboolean(L, !(lhs != rhs));
+    return 1;
+}
+
+// mat:invert()  ->  new Mat4 holding the inverse, or nil when Mat4::invert
+// reports the matrix as non-invertible. `mat` itself is never modified.
+static int mat4_invert(lua_State* L)
+{
+    const Mat4& source = lua_torive<ScriptedMat4>(L, 1)->value;
+    Mat4 inverse;
+    if (!source.invert(&inverse))
+        lua_pushnil(L);
+    else
+        lua_pushmat4(L, inverse);
+    return 1;
+}
+
+static int mat4_transpose(lua_State* L) // mat:transpose() -> new Mat4
+{
+    const Mat4& source = lua_torive<ScriptedMat4>(L, 1)->value;
+    lua_pushmat4(L, source.transposed()); // `mat` itself is left unchanged
+    return 1;
+}
+
+// mat:transformPoint(x, y, z)  ->  vector(x', y', z'). The point is treated as
+// (x, y, z, 1); the result is divided by w' unless w' is exactly 0 or 1.
+static int mat4_transformPoint(lua_State* L)
+{
+    const Mat4& matrix = lua_torive<ScriptedMat4>(L, 1)->value;
+    float point[3];
+    for (int axis = 0; axis < 3; ++axis)
+        point[axis] = float(luaL_checknumber(L, axis + 2));
+    float clip[4];
+    matrix.transformVec4(clip, point[0], point[1], point[2], 1.f);
+    const float w = clip[3];
+    if (w == 0.f || w == 1.f)
+    {
+        // Divide skipped: w == 1 needs none and w == 0 would divide by zero.
+        lua_pushvector(L, clip[0], clip[1], clip[2]);
+        return 1;
+    }
+    const float invW = 1.f / w;
+    lua_pushvector(L, clip[0] * invW, clip[1] * invW, clip[2] * invW);
+    return 1;
+}
+
+// mat:transformVec4(x, y, z, w)  ->  x', y', z', w' as four separate numbers.
+// No perspective divide is applied, so callers working in clip space keep
+// the homogeneous w.
+static int mat4_transformVec4(lua_State* L)
+{
+    const Mat4& matrix = lua_torive<ScriptedMat4>(L, 1)->value;
+    float in[4]; // x, y, z, w from Lua arguments 2..5
+    for (int i = 0; i < 4; ++i)
+        in[i] = float(luaL_checknumber(L, i + 2));
+    float out[4];
+    matrix.transformVec4(out, in[0], in[1], in[2], in[3]);
+    // Results are pushed in x', y', z', w' order.
+    for (float component : out)
+    {
+        lua_pushnumber(L, component);
+    }
+    return 4;
+}
+
+// mat:writeToBuffer(buf, byteOffset) copies the matrix's 16 column-major
+// floats (64 bytes) into a Luau buffer, raising if they would not fit.
+static int mat4_writeToBuffer(lua_State* L)
+{
+    constexpr size_t kMatrixBytes = 64;
+    const Mat4& matrix = lua_torive<ScriptedMat4>(L, 1)->value;
+    size_t capacity = 0;
+    auto* bytes = static_cast<uint8_t*>(luaL_checkbuffer(L, 2, &capacity));
+    const int offset = int(luaL_checkinteger(L, 3));
+    // size_t(offset) is only evaluated once a negative offset is ruled out.
+    if (offset >= 0 && size_t(offset) + kMatrixBytes <= capacity)
+        std::memcpy(bytes + offset, matrix.values(), kMatrixBytes);
+    else
+        luaL_error(L, "Mat4:writeToBuffer offset out of range");
+    return 0;
+}
+
+static int mat4_namecall(lua_State* L)
+{
+    int atom;
+    // Slot 1 is the Mat4 receiver, so the method name has to come from here.
+    const char* method = lua_namecallatom(L, &atom);
+    if (method != nullptr)
+    {
+        switch (atom)
+        {
+            case (int)LuaAtoms::invert:
+                return mat4_invert(L);
+            case (int)LuaAtoms::transpose:
+                return mat4_transpose(L);
+            case (int)LuaAtoms::transformPoint:
+                return mat4_transformPoint(L);
+            case (int)LuaAtoms::transformVec4:
+                return mat4_transformVec4(L);
+            case (int)LuaAtoms::writeToBuffer:
+                return mat4_writeToBuffer(L);
+        }
+    }
+    luaL_error(L, "%s is not a valid method of %s",
+               method != nullptr ? method : "?", ScriptedMat4::luaName);
+    return 0;
+}
+
+static const luaL_Reg mat4StaticMethods[] = { // functions on the Mat4 table
+    {"identity", mat4_identity},
+    {"values", mat4_values}, // 16 numbers, column-major
+    {"fromTranslation", mat4_fromTranslation},
+    {"fromScale", mat4_fromScale},
+    {"fromRotationX", mat4_fromRotationX},
+    {"fromRotationY", mat4_fromRotationY},
+    {"fromRotationZ", mat4_fromRotationZ},
+    {"perspective", mat4_perspective},
+    {"multiply", mat4_static_multiply}, // in-place: Mat4.multiply(out, a, b)
+    {"invert", mat4_static_invert},     // in-place: Mat4.invert(out, m)
+    {nullptr, nullptr}};                // end-of-list sentinel
+
+// Registers the Mat4 library table (static constructors and helpers) and
+// installs the metamethods on the ScriptedMat4 metatable.
+int luaopen_rive_mat4(lua_State* L)
+{
+    luaL_register(L, ScriptedMat4::luaName, mat4StaticMethods);
+    // Assumed to leave the type's metatable on top of the stack — the
+    // setfield calls below target index -2 beneath each pushed function.
+    lua_register_rive<ScriptedMat4>(L);
+
+    static const luaL_Reg metamethods[] = {{"__index", mat4_index},
+                                           {"__newindex", mat4_newindex},
+                                           {"__mul", mat4_mul},
+                                           {"__eq", mat4_eq},
+                                           {"__namecall", mat4_namecall}};
+    for (const luaL_Reg& entry : metamethods)
+    {
+        lua_pushcfunction(L, entry.func, nullptr);
+        lua_setfield(L, -2, entry.name);
+    }
+
+    lua_setreadonly(L, -1, true); // metatable can no longer be modified
+    lua_pop(L, 1);                // pop metatable
+    return 1;
+}
+
+#endif
diff --git a/src/lua/math/lua_math.cpp b/src/lua/math/lua_math.cpp
index dbdf8f2..f7987c8 100644
--- a/src/lua/math/lua_math.cpp
+++ b/src/lua/math/lua_math.cpp
@@ -3,10 +3,12 @@
 
 int luaopen_rive_vector(lua_State* L);
 int luaopen_rive_mat2d(lua_State* L);
+int luaopen_rive_mat4(lua_State* L);
 int luaopen_rive_color(lua_State* L);
 
 static const lua_CFunction mathTypes[] = {luaopen_rive_vector,
                                           luaopen_rive_mat2d,
+                                          luaopen_rive_mat4,
                                           luaopen_rive_color};
 
 int luaopen_rive_math(lua_State* L)
diff --git a/src/lua/math/lua_vec2d.cpp b/src/lua/math/lua_vec2d.cpp
index 4b4d73e..d9cdd01 100644
--- a/src/lua/math/lua_vec2d.cpp
+++ b/src/lua/math/lua_vec2d.cpp
@@ -24,6 +24,13 @@
             case '2':
                 lua_pushnumber(L, vec[1]);
                 return 1;
+            case 'z':
+            case '3':
+                // Luau's vector type stores 3 components; .z is reachable
+                // intrinsically through the VM's named-axis fastpath, but
+                // numeric `[3]` indexing routes through this metamethod.
+                lua_pushnumber(L, vec[2]);
+                return 1;
             default:
                 break;
         }
diff --git a/src/lua/rive_lua_libs.cpp b/src/lua/rive_lua_libs.cpp
index 8963773..fcce41b 100644
--- a/src/lua/rive_lua_libs.cpp
+++ b/src/lua/rive_lua_libs.cpp
@@ -245,6 +245,11 @@
     {"onCancel", (int16_t)LuaAtoms::onCancel},
     {"getStatus", (int16_t)LuaAtoms::getStatus},
     {"decodeImage", (int16_t)LuaAtoms::decodeImage},
+    // Mat4
+    {"transpose", (int16_t)LuaAtoms::transpose},
+    {"transformPoint", (int16_t)LuaAtoms::transformPoint},
+    {"transformVec4", (int16_t)LuaAtoms::transformVec4},
+    {"writeToBuffer", (int16_t)LuaAtoms::writeToBuffer},
 };
 
 static const luaL_Reg lualibs[] = {
diff --git a/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp b/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp
new file mode 100644
index 0000000..3562c01
--- /dev/null
+++ b/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp
@@ -0,0 +1,287 @@
+// Tests for the Mat4 scripted type and a perf benchmark comparing the C++
+// SIMD-accelerated path against a pure-Luau buffer-based mat4 implementation
+// modelled on examples/SpinningCube.luau.
+
+#include "catch.hpp"
+#include "scripting_test_utilities.hpp"
+
+#include <chrono>
+#include <climits>
+#include <cstdio>
+
+using namespace rive;
+
+TEST_CASE("Mat4 identity has expected values", "[scripting]")
+{
+    // Each probe runs in its own ScriptingTest and reads the returned number.
+    auto eval = [](const char* source) {
+        return lua_tonumber(ScriptingTest(source).state(), -1);
+    };
+    // Diagonal entries via the mRC field names.
+    CHECK(eval("return Mat4.identity().m11") == 1.0);
+    CHECK(eval("return Mat4.identity().m22") == 1.0);
+    CHECK(eval("return Mat4.identity().m33") == 1.0);
+    CHECK(eval("return Mat4.identity().m44") == 1.0);
+    // One off-diagonal entry.
+    CHECK(eval("return Mat4.identity().m12") == 0.0);
+    // Linear 1-based indices: [1] is m11 and [6] is m22.
+    CHECK(eval("return Mat4.identity()[1]") == 1.0);
+    CHECK(eval("return Mat4.identity()[6]") == 1.0);
+}
+
+TEST_CASE("Mat4.values stores column-major", "[scripting]")
+{
+    // Arguments fill column 0 first (entries 1..4), then columns 1, 2 and 3.
+    const char* script = "local m = Mat4.values(\n"
+                         "  1, 2, 3, 4,\n" // column 0
+                         "  5, 6, 7, 8,\n" // column 1
+                         "  9,10,11,12,\n" // column 2
+                         " 13,14,15,16)\n" // column 3
+                         "return m.m11, m.m21, m.m31, m.m41, m.m14, m.m44\n";
+    ScriptingTest test(script, 6);
+    // Return order: column 0 top to bottom, then m14 and m44 from column 3.
+    const double expected[6] = {1.0, 2.0, 3.0, 4.0, 13.0, 16.0};
+    for (int i = 0; i < 6; ++i)
+    {
+        CHECK(lua_tonumber(test.state(), i - 6) == expected[i]);
+    }
+}
+
+TEST_CASE("Mat4 translation transforms a point", "[scripting]")
+{
+    ScriptingTest test("local m = Mat4.fromTranslation(10, 20, 30)\n"
+                       "local v = m:transformPoint(1, 2, 3)\n"
+                       "return v.x, v.y, v.z\n",
+                       3);
+    CHECK(lua_tonumber(test.state(), -3) == 11.0); // x: 1 + 10
+    CHECK(lua_tonumber(test.state(), -2) == 22.0); // y: 2 + 20
+    CHECK(lua_tonumber(test.state(), -1) == 33.0); // z: 3 + 30
+}
+
+TEST_CASE("Mat4 transformVec4 returns homogeneous components", "[scripting]")
+{
+    // All four components come back as separate numbers. There is no
+    // perspective divide, so w is preserved as the final return value.
+    ScriptingTest test("local m = Mat4.fromTranslation(10, 20, 30)\n"
+                       "return m:transformVec4(1, 2, 3, 1)\n",
+                       4);
+    const double expected[4] = {11.0, 22.0, 33.0, 1.0}; // x', y', z', w'
+    for (int i = 0; i < 4; ++i)
+        CHECK(lua_tonumber(test.state(), i - 4) == expected[i]);
+}
+
+TEST_CASE("transformPoint result supports z and [3]", "[scripting]")
+{
+    // The 3D vector from a Mat4 transform must expose its third component
+    // both as .z (intrinsic VM fastpath) and as v[3] (metamethod).
+    const char* src = "local m = Mat4.fromTranslation(10, 20, 30)\n"
+                      "local v = m:transformPoint(1, 2, 3)\n"
+                      "return v[1], v[2], v[3], v.z\n";
+    auto t = ScriptingTest(src, 4);
+    const double expected[4] = {11.0, 22.0, 33.0, 33.0}; // v[3] and v.z agree
+    for (int i = 0; i < 4; ++i)
+        CHECK(lua_tonumber(t.state(), i - 4) == expected[i]);
+}
+
+TEST_CASE("Mat4 multiply composes transforms", "[scripting]")
+{
+    // t * s scales first, then translates: (1,1,1) -> (2,2,2) -> (12,2,2).
+    ScriptingTest test("local t = Mat4.fromTranslation(10, 0, 0)\n"
+                       "local s = Mat4.fromScale(2, 2, 2)\n"
+                       "local m = t * s\n"
+                       "local v = m:transformPoint(1, 1, 1)\n"
+                       "return v.x, v.y, v.z\n",
+                       3);
+    const double expected[3] = {12.0, 2.0, 2.0};
+    for (int i = 0; i < 3; ++i)
+        CHECK(lua_tonumber(test.state(), i - 3) == expected[i]);
+}
+
+TEST_CASE("Mat4 invert round-trips", "[scripting]")
+{
+    // m * inverse(m) should be the identity. Only the summed absolute error of
+    // the diagonal is checked, since exact equality can fail on FP rounding.
+    ScriptingTest test(
+        "local m = Mat4.fromTranslation(3, -4, 5) * Mat4.fromScale(2, 2, 2)\n"
+        "local inv = m:invert()\n"
+        "local r = m * inv\n"
+        "local id = Mat4.identity()\n"
+        "return math.abs(r.m11 - 1) + math.abs(r.m22 - 1) + math.abs(r.m33 - 1) + math.abs(r.m44 - 1)\n");
+    CHECK(lua_tonumber(test.state(), -1) < 1e-5);
+}
+
+TEST_CASE("Mat4.multiply writes in place", "[scripting]")
+{
+    // Static, allocation-free form: the product lands in the existing `out`.
+    // a * b keeps a's translation (1, 2, 3) in column 4 and b's scale in m11.
+    ScriptingTest test("local out = Mat4.identity()\n"
+                       "local a = Mat4.fromTranslation(1, 2, 3)\n"
+                       "local b = Mat4.fromScale(4, 4, 4)\n"
+                       "Mat4.multiply(out, a, b)\n"
+                       "return out.m14, out.m24, out.m34, out.m11\n",
+                       4);
+    const double expected[4] = {1.0, 2.0, 3.0, 4.0}; // m14, m24, m34, m11
+    for (int i = 0; i < 4; ++i)
+        CHECK(lua_tonumber(test.state(), i - 4) == expected[i]);
+}
+
+TEST_CASE("Mat4:writeToBuffer stores 64 bytes column-major", "[scripting]")
+{
+    ScriptingTest test(
+        "local m = Mat4.values(\n"
+        "  1, 2, 3, 4,  5, 6, 7, 8,  9,10,11,12, 13,14,15,16)\n"
+        "local buf = buffer.create(80)\n"
+        "m:writeToBuffer(buf, 16)\n"
+        "return buffer.readf32(buf, 16), buffer.readf32(buf, 16+4*4), buffer.readf32(buf, 16+15*4)\n",
+        3);
+    CHECK(lua_tonumber(test.state(), -3) == 1.0);  // float 0 at byte 16
+    CHECK(lua_tonumber(test.state(), -2) == 5.0);  // float 4: start of column 1
+    CHECK(lua_tonumber(test.state(), -1) == 16.0); // float 15: last element
+}
+
+namespace
+{
+// Pure-Luau reference matmul over 64-byte Luau buffers of 16 f32 values, using
+// the same col * 4 + row indexing; mirrors m4mul() in examples/SpinningCube.luau.
+const char* kLuauBufferMatMulPrelude =
+    R"(
+local function m4get(buf: buffer, i: number): number
+    return buffer.readf32(buf, i * 4)
+end
+local function m4set(buf: buffer, i: number, v: number)
+    buffer.writef32(buf, i * 4, v)
+end
+local function m4identity(): buffer
+    local b = buffer.create(64)
+    m4set(b, 0, 1)
+    m4set(b, 5, 1)
+    m4set(b, 10, 1)
+    m4set(b, 15, 1)
+    return b
+end
+local function m4mul(out: buffer, a: buffer, b: buffer)
+    for col = 0, 3 do
+        for row = 0, 3 do
+            local sum: number = 0
+            for k = 0, 3 do
+                sum += m4get(a, k * 4 + row) * m4get(b, col * 4 + k)
+            end
+            m4set(out, col * 4 + row, sum)
+        end
+    end
+end
+)";
+} // namespace
+
+TEST_CASE("Mat4 perf — C++ vs Luau-buffer matmul", "[scripting][benchmark]")
+{
+    const int N = 20000;  // matmul iterations per script
+    const int WARMUP = 1; // leading samples that are discarded
+    const int RUNS = 3;   // timed samples; the fastest one is reported
+
+    // Each sample times a full ScriptingTest: fresh VM, compile, run. That
+    // overhead is shared by every variant, so the comparison stays valid even
+    // though absolute numbers include startup. steady_clock is used because
+    // interval timing needs a monotonic clock; high_resolution_clock need not be.
+    auto bestRun = [&](const char* src) -> long long {
+        long long best = LLONG_MAX;
+        for (int run = 0; run < WARMUP + RUNS; ++run)
+        {
+            auto t0 = std::chrono::steady_clock::now();
+            ScriptingTest test(src, 0);
+            auto t1 = std::chrono::steady_clock::now();
+            auto us =
+                std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0)
+                    .count();
+            if (run >= WARMUP && us < best)
+                best = us;
+        }
+        return best;
+    };
+
+    char src[4096];
+
+    // Variant A: Mat4 * Mat4 (C++, allocates a Mat4 each iteration)
+    snprintf(src,
+             sizeof(src),
+             "local a = Mat4.fromTranslation(1, 2, 3)\n"
+             "local b = Mat4.fromRotationZ(0.1)\n"
+             "local m = Mat4.identity()\n"
+             "for i = 1, %d do m = a * b end\n",
+             N);
+    long long cppMul = bestRun(src);
+
+    // Variant B: Mat4.multiply(out, a, b) (C++, in-place, no alloc)
+    snprintf(src,
+             sizeof(src),
+             "local a = Mat4.fromTranslation(1, 2, 3)\n"
+             "local b = Mat4.fromRotationZ(0.1)\n"
+             "local out = Mat4.identity()\n"
+             "for i = 1, %d do Mat4.multiply(out, a, b) end\n",
+             N);
+    long long cppInPlace = bestRun(src);
+
+    // Variant C: pure-Luau buffer matmul (the SpinningCube approach)
+    char luauSrc[4096];
+    snprintf(
+        luauSrc,
+        sizeof(luauSrc),
+        "%s\n"
+        "local a = m4identity()\n"
+        "local b = m4identity()\n"
+        // Plant a few non-zero entries so the inner loop does real work.
+        "m4set(a, 12, 1); m4set(a, 13, 2); m4set(a, 14, 3)\n"
+        "m4set(b, 0, 0.99); m4set(b, 1, 0.099); m4set(b, 4, -0.099); m4set(b, 5, 0.99)\n"
+        "local out = m4identity()\n"
+        "for i = 1, %d do m4mul(out, a, b) end\n",
+        kLuauBufferMatMulPrelude,
+        N);
+    long long luauBuf = bestRun(luauSrc);
+
+    // Variant D: matmul on a Luau table of 16 numbers — no SIMD, no buffer
+    // reads, but every entry is a Lua TValue (8-byte tag + double).
+    snprintf(
+        luauSrc,
+        sizeof(luauSrc),
+        "local function tnew()\n"
+        "  return {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1}\n"
+        "end\n"
+        "local function tmul(out, a, b)\n"
+        "  for col = 0, 3 do\n"
+        "    for row = 0, 3 do\n"
+        "      local s = 0\n"
+        "      for k = 0, 3 do\n"
+        "        s += a[k*4 + row + 1] * b[col*4 + k + 1]\n"
+        "      end\n"
+        "      out[col*4 + row + 1] = s\n"
+        "    end\n"
+        "  end\n"
+        "end\n"
+        "local a = tnew(); a[13] = 1; a[14] = 2; a[15] = 3\n"
+        "local b = tnew(); b[1] = 0.99; b[2] = 0.099; b[5] = -0.099; b[6] = 0.99\n"
+        "local out = tnew()\n"
+        "for i = 1, %d do tmul(out, a, b) end\n",
+        N);
+    long long luauTable = bestRun(luauSrc);
+
+    fprintf(stderr,
+            "\n"
+            "Mat4 matmul perf (%d iterations, best of %d, includes VM setup):\n"
+            "  C++ a*b           : %lld us\n"
+            "  C++ multiply(out) : %lld us\n"
+            "  Luau buffer mul   : %lld us\n"
+            "  Luau table mul    : %lld us\n"
+            "\n",
+            N,
+            RUNS,
+            cppMul,
+            cppInPlace,
+            luauBuf,
+            luauTable);
+
+    // Sanity: the in-place C++ path must be at least as fast as either Luau
+    // approach. If this ever fails we have a real perf regression worth
+    // investigating.
+    CHECK(cppInPlace <= luauBuf);
+    CHECK(cppInPlace <= luauTable);
+}