feat(scripting): Mat4 affine fast paths + reverse-Z perspective (#12454) 072832aecc

Co-authored-by: Luigi Rosso <luigi-rosso@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head
index d3f91ae..857b9de 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-7c539a46ff2f6ba167a5f4104f173b5861b5917f
+072832aecc271e330aa5c04ff9c8d2834f3d369e
diff --git a/include/rive/lua/rive_lua_libs.hpp b/include/rive/lua/rive_lua_libs.hpp
index 3527173..1a5674e 100644
--- a/include/rive/lua/rive_lua_libs.hpp
+++ b/include/rive/lua/rive_lua_libs.hpp
@@ -339,6 +339,7 @@
     transformPoint,
     transformVec4,
     writeToBuffer,
+    invertAffine,
 };
 
 struct ScriptedMat2D
diff --git a/include/rive/math/mat4.hpp b/include/rive/math/mat4.hpp
index 3b0351d..341ab2b 100644
--- a/include/rive/math/mat4.hpp
+++ b/include/rive/math/mat4.hpp
@@ -156,6 +156,28 @@
         return m;
     }
 
+    // Right-handed perspective with reverse-Z (near -> 1, far -> 0) and an
+    // infinite far plane: clip.z = near_ * w and clip.w = -z, so a point with
+    // w = 1 gets depth near_ / -z. With a float depth buffer this spreads
+    // depth precision nearly uniformly over the frustum. See Upchurch &
+    // Desbrun, "Tightening the Precision of Perspective Rendering" (2012).
+    //
+    // Caller's depth buffer must be cleared to 0 (not 1) and the depth test
+    // flipped (GREATER, not LESS). Inputs are not validated.
+    static Mat4 perspectiveReverseZ(float fovYRadians,
+                                    float aspect,
+                                    float near_)
+    {
+        float f = 1.f / std::tan(fovYRadians * 0.5f); // cot(fovY / 2)
+        Mat4 m{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+        m.m_buffer[0] = f / aspect; // x scale
+        m.m_buffer[5] = f;          // y scale
+        m.m_buffer[10] = 0.f;       // clip.z has no z term
+        m.m_buffer[11] = -1.f;      // clip.w = -z
+        m.m_buffer[14] = near_;     // clip.z = near_ * w
+        return m;
+    }
+
     // SIMD: out = lhs * rhs. Both column-major.
     static Mat4 multiply(const Mat4& lhs, const Mat4& rhs)
     {
@@ -181,6 +203,38 @@
 
     Mat4 operator*(const Mat4& rhs) const { return multiply(*this, rhs); }
 
+    // SIMD: out = lhs * rhs, assuming both are affine (bottom row
+    // [0, 0, 0, 1]). rhs's bottom row is never read: the c3 * rhs[3] term is
+    // dropped for columns 0..2 (rhs[3] = 0) and becomes + c3 for column 3.
+    //
+    // The output's bottom row is computed from lhs's w lanes, so it is only
+    // [0, 0, 0, 1] when lhs is affine — callers must ensure the contract.
+    static Mat4 multiplyAffine(const Mat4& lhs, const Mat4& rhs)
+    {
+        const float* L = lhs.m_buffer.data();
+        const float* R = rhs.m_buffer.data();
+        float4 c0 = simd::load4f(L);      // [_,_,_,0]
+        float4 c1 = simd::load4f(L + 4);  // [_,_,_,0]
+        float4 c2 = simd::load4f(L + 8);  // [_,_,_,0]
+        float4 c3 = simd::load4f(L + 12); // [_,_,_,1]
+
+        Mat4 out;
+        // Cols 0..2: rhs[3] is assumed 0, so the c3*rhs[3] term is omitted.
+        for (int j = 0; j < 3; ++j)
+        {
+            const float* rcol = R + j * 4;
+            float4 result = c0 * rcol[0] + c1 * rcol[1] + c2 * rcol[2];
+            simd::store(out.m_buffer.data() + j * 4, result);
+        }
+        // Col 3: rhs[3] is assumed 1, so c3 is added unscaled.
+        {
+            const float* rcol = R + 12;
+            float4 result = c0 * rcol[0] + c1 * rcol[1] + c2 * rcol[2] + c3;
+            simd::store(out.m_buffer.data() + 12, result);
+        }
+        return out;
+    }
+
     // SIMD: out = M * (x, y, z, w). Returns a 4-component vector (xyzw).
     void transformVec4(float out[4], float x, float y, float z, float w) const
     {
@@ -266,6 +320,66 @@
         return true;
     }
 
+    // Closed-form inverse for affine matrices (bottom row [0, 0, 0, 1]).
+    // Inverts the 3x3 linear part R via cofactors, then writes -R^-1 * t
+    // into the translation column. Much smaller and faster than the full
+    // 4x4 cofactor `invert`. `result` may alias `this`.
+    //
+    // Returns false (and leaves `result` unchanged) if det(R) is zero, not
+    // finite, or so small that 1/det overflows. Input must be affine.
+    bool invertAffine(Mat4* result) const
+    {
+        const float* m = m_buffer.data();
+        // The 3x3 linear part R has R[row][col] = m[col*4 + row].
+        // Cofactors C[i][j] of column 0 of R, expanded for det along col 0.
+        float c00 = m[5] * m[10] - m[6] * m[9];
+        float c10 = m[6] * m[8] - m[4] * m[10];
+        float c20 = m[4] * m[9] - m[5] * m[8];
+        float det = m[0] * c00 + m[1] * c10 + m[2] * c20;
+        // A NaN/inf det or overflowing 1/det cannot give a finite inverse.
+        if (det == 0.f || !std::isfinite(det) || !std::isfinite(1.f / det))
+            return false;
+        float invDet = 1.f / det;
+        // Remaining 6 cofactors.
+        float c01 = m[2] * m[9] - m[1] * m[10];
+        float c02 = m[1] * m[6] - m[2] * m[5];
+        float c11 = m[0] * m[10] - m[2] * m[8];
+        float c12 = m[2] * m[4] - m[0] * m[6];
+        float c21 = m[1] * m[8] - m[0] * m[9];
+        float c22 = m[0] * m[5] - m[1] * m[4];
+
+        // R^-1 = adj(R) / det; below, ri_j = Rinv[i][j] = C[j][i] / det.
+        float r0_0 = c00 * invDet, r0_1 = c10 * invDet, r0_2 = c20 * invDet;
+        float r1_0 = c01 * invDet, r1_1 = c11 * invDet, r1_2 = c21 * invDet;
+        float r2_0 = c02 * invDet, r2_1 = c12 * invDet, r2_2 = c22 * invDet;
+
+        // Translation column: -R^-1 * t.
+        float tx = m[12], ty = m[13], tz = m[14];
+        float ix = -(r0_0 * tx + r0_1 * ty + r0_2 * tz);
+        float iy = -(r1_0 * tx + r1_1 * ty + r1_2 * tz);
+        float iz = -(r2_0 * tx + r2_1 * ty + r2_2 * tz);
+
+        // Store column-major: o[col*4 + row] = Rinv[row][col].
+        float* o = result->m_buffer.data();
+        o[0] = r0_0;
+        o[1] = r1_0;
+        o[2] = r2_0;
+        o[3] = 0.f;
+        o[4] = r0_1;
+        o[5] = r1_1;
+        o[6] = r2_1;
+        o[7] = 0.f;
+        o[8] = r0_2;
+        o[9] = r1_2;
+        o[10] = r2_2;
+        o[11] = 0.f;
+        o[12] = ix;
+        o[13] = iy;
+        o[14] = iz;
+        o[15] = 1.f;
+        return true;
+    }
+
 private:
     std::array<float, 16> m_buffer;
 };
diff --git a/src/lua/math/lua_mat4.cpp b/src/lua/math/lua_mat4.cpp
index 402b64a..1cb3344 100644
--- a/src/lua/math/lua_mat4.cpp
+++ b/src/lua/math/lua_mat4.cpp
@@ -80,6 +80,16 @@
     return 1;
 }
 
+// Reverse-Z infinite-far perspective. See Mat4::perspectiveReverseZ.
+static int mat4_perspectiveReverseZ(lua_State* L)
+{
+    float fov = float(luaL_checknumber(L, 1));    // arg 1: fovYRadians
+    float aspect = float(luaL_checknumber(L, 2)); // arg 2: aspect
+    float n = float(luaL_checknumber(L, 3));      // arg 3: near_
+    lua_pushmat4(L, Mat4::perspectiveReverseZ(fov, aspect, n));
+    return 1; // one result: the pushed matrix
+}
+
 // In-place: Mat4.multiply(out, a, b)  ->  out = a * b. Returns out.
 // Avoids per-call userdata allocation in tight loops.
 static int mat4_static_multiply(lua_State* L)
@@ -92,6 +102,19 @@
     return 1;
 }
 
+// In-place: Mat4.multiplyAffine(out, a, b)  ->  out = a * b, assuming both
+// inputs are affine (bottom row [0,0,0,1]). Faster than `multiply` (never
+// multiplies by b's bottom row). `out` may alias `a` or `b`. Returns out.
+static int mat4_static_multiplyAffine(lua_State* L)
+{
+    auto out = lua_torive<ScriptedMat4>(L, 1);
+    auto a = lua_torive<ScriptedMat4>(L, 2);
+    auto b = lua_torive<ScriptedMat4>(L, 3);
+    out->value = Mat4::multiplyAffine(a->value, b->value);
+    lua_pushvalue(L, 1); // return the `out` argument itself
+    return 1;
+}
+
 static int mat4_static_invert(lua_State* L)
 {
     auto out = lua_torive<ScriptedMat4>(L, 1);
@@ -100,6 +123,16 @@
     return 1;
 }
 
+// In-place: Mat4.invertAffine(out, in) — closed-form affine inverse. Returns
+// true if invertible, else false with `out` untouched. Input must be affine.
+static int mat4_static_invertAffine(lua_State* L)
+{
+    auto out = lua_torive<ScriptedMat4>(L, 1);
+    auto in = lua_torive<ScriptedMat4>(L, 2);
+    lua_pushboolean(L, in->value.invertAffine(&out->value));
+    return 1; // one result: the success flag
+}
+
 // Field index lookup. Supports m11..m44 (row,col 1-indexed) and 1..16
 // (column-major linear index, 1-indexed).
 static int mat4_index_field(lua_State* L,
@@ -219,6 +252,19 @@
     return 1;
 }
 
+static int mat4_invertAffine(lua_State* L)
+{
+    auto mat = lua_torive<ScriptedMat4>(L, 1); // self for m:invertAffine()
+    Mat4 result;
+    if (mat->value.invertAffine(&result))
+    {
+        lua_pushmat4(L, result); // success: push the inverse as a new value
+        return 1;
+    }
+    lua_pushnil(L); // invertAffine reported failure: push nil instead
+    return 1;
+}
+
 static int mat4_transpose(lua_State* L)
 {
     auto mat = lua_torive<ScriptedMat4>(L, 1);
@@ -293,6 +339,8 @@
         {
             case (int)LuaAtoms::invert:
                 return mat4_invert(L);
+            case (int)LuaAtoms::invertAffine:
+                return mat4_invertAffine(L);
             case (int)LuaAtoms::transpose:
                 return mat4_transpose(L);
             case (int)LuaAtoms::transformPoint:
@@ -319,8 +367,11 @@
     {"fromRotationY", mat4_fromRotationY},
     {"fromRotationZ", mat4_fromRotationZ},
     {"perspective", mat4_perspective},
+    {"perspectiveReverseZ", mat4_perspectiveReverseZ},
     {"multiply", mat4_static_multiply},
+    {"multiplyAffine", mat4_static_multiplyAffine},
     {"invert", mat4_static_invert},
+    {"invertAffine", mat4_static_invertAffine},
     {nullptr, nullptr}};
 
 int luaopen_rive_mat4(lua_State* L)
diff --git a/src/lua/rive_lua_libs.cpp b/src/lua/rive_lua_libs.cpp
index fcce41b..7c9ac12 100644
--- a/src/lua/rive_lua_libs.cpp
+++ b/src/lua/rive_lua_libs.cpp
@@ -250,6 +250,7 @@
     {"transformPoint", (int16_t)LuaAtoms::transformPoint},
     {"transformVec4", (int16_t)LuaAtoms::transformVec4},
     {"writeToBuffer", (int16_t)LuaAtoms::writeToBuffer},
+    {"invertAffine", (int16_t)LuaAtoms::invertAffine},
 };
 
 static const luaL_Reg lualibs[] = {
diff --git a/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp b/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp
index 3562c01..0b4fe1f 100644
--- a/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp
+++ b/tests/unit_tests/runtime/scripting/scripting_mat4_test.cpp
@@ -125,6 +125,83 @@
     CHECK(lua_tonumber(t.state(), -1) == 4.0);
 }
 
+TEST_CASE("Mat4.multiplyAffine matches multiply for affine inputs",
+          "[scripting]")
+{
+    // Both operands are affine, so the fast and slow paths must agree
+    // bit-exactly on every entry (the summed diff is compared against 0).
+    const char* src =
+        "local a = Mat4.fromTranslation(3, -1, 5) * Mat4.fromRotationY(0.7)\n"
+        "local b = Mat4.fromScale(2, 0.5, 1) * Mat4.fromRotationZ(-0.3)\n"
+        "local slow = Mat4.identity()\n"
+        "local fast = Mat4.identity()\n"
+        "Mat4.multiply(slow, a, b)\n"
+        "Mat4.multiplyAffine(fast, a, b)\n"
+        // Sum |slow[i] - fast[i]| over linear indices 1..16; must be 0.
+        "local diff = 0\n"
+        "for i = 1, 16 do diff = diff + math.abs(slow[i] - fast[i]) end\n"
+        "return diff, fast.m41, fast.m42, fast.m43, fast.m44\n";
+    auto t = ScriptingTest(src, 5);
+    CHECK(lua_tonumber(t.state(), -5) == 0.0); // diff
+    // Bottom row stays [0, 0, 0, 1] (affine invariant).
+    CHECK(lua_tonumber(t.state(), -4) == 0.0); // fast.m41
+    CHECK(lua_tonumber(t.state(), -3) == 0.0); // fast.m42
+    CHECK(lua_tonumber(t.state(), -2) == 0.0); // fast.m43
+    CHECK(lua_tonumber(t.state(), -1) == 1.0); // fast.m44
+}
+
+TEST_CASE("Mat4:invertAffine round-trips", "[scripting]")
+{
+    const char* src = // script returns one number: an error sum for m * inv
+        "local m = Mat4.fromTranslation(3, -4, 5) * Mat4.fromRotationY(0.4)"
+        " * Mat4.fromScale(2, 2, 2)\n"
+        "local inv = m:invertAffine()\n"
+        "assert(inv ~= nil)\n"
+        "local r = m * inv\n"
+        "return math.abs(r.m11 - 1) + math.abs(r.m22 - 1)"
+        " + math.abs(r.m33 - 1) + math.abs(r.m44 - 1)"
+        " + math.abs(r.m14) + math.abs(r.m24) + math.abs(r.m34)\n";
+    auto t = ScriptingTest(src);
+    CHECK(lua_tonumber(t.state(), -1) < 1e-5); // diagonal ~1, column 4 ~0
+}
+
+TEST_CASE("Mat4.invertAffine writes in place", "[scripting]")
+{
+    const char* src = "local m = Mat4.fromTranslation(10, 0, 0)\n"
+                      "local out = Mat4.identity()\n"
+                      "local ok = Mat4.invertAffine(out, m)\n"
+                      "return ok, out.m14, out.m24, out.m34\n";
+    auto t = ScriptingTest(src, 4);
+    CHECK(lua_toboolean(t.state(), -4) == 1);   // ok
+    CHECK(lua_tonumber(t.state(), -3) == -10.0); // out.m14: negated x offset
+    CHECK(lua_tonumber(t.state(), -2) == 0.0);   // out.m24
+    CHECK(lua_tonumber(t.state(), -1) == 0.0);   // out.m34
+}
+
+TEST_CASE("Mat4:invertAffine returns nil on singular linear part",
+          "[scripting]")
+{
+    // Zero scale on Y makes the linear part's determinant 0 — singular.
+    const char* src = "local m = Mat4.fromScale(2, 0, 1)\n"
+                      "return m:invertAffine()\n";
+    auto t = ScriptingTest(src);
+    CHECK(lua_isnil(t.state(), -1)); // method form signals failure with nil
+}
+
+TEST_CASE("Mat4.perspectiveReverseZ has expected layout", "[scripting]")
+{
+    // For aspect=1, fovY=90deg: f = 1/tan(45deg) = 1.
+    // m11 = f/aspect = 1, m22 = f = 1, m33 = 0, m43 = -1, m34 = near (5).
+    const char* src = "local p = Mat4.perspectiveReverseZ(math.rad(90), 1, 5)\n"
+                      "return p.m11, p.m22, p.m33, p.m43, p.m34\n";
+    auto t = ScriptingTest(src, 5);
+    CHECK(lua_tonumber(t.state(), -5) == Approx(1.0).margin(1e-6)); // p.m11
+    CHECK(lua_tonumber(t.state(), -4) == Approx(1.0).margin(1e-6)); // p.m22
+    CHECK(lua_tonumber(t.state(), -3) == 0.0);                      // p.m33
+    CHECK(lua_tonumber(t.state(), -2) == -1.0);                     // p.m43
+    CHECK(lua_tonumber(t.state(), -1) == 5.0);                      // p.m34
+}
+
 TEST_CASE("Mat4:writeToBuffer stores 64 bytes column-major", "[scripting]")
 {
     const char* src =