ore: gate float render targets and baseVertex on backend features (#12712) 10e90773d5
* ore: gate float render targets and baseVertex on backend features

* ore: test float renderTarget and baseVertex feature guards

fix(ore): d3d12 upload stages caller region, not full subresource (#12702) 0cc614761c

fix(ore): ContextVulkan TextureVulkan upload defer to next recording CB (#12700) cccee5e480
* fix ContextVulkan TextureVulkan upload defer to next recording CB

* ore vulkan upload: batch flush barriers, fix array depth default, 64-bit region bounds

* ore vulkan upload: 64-bit bytesPerRow fallback before uint32_t guard

Co-authored-by: Luigi Rosso <luigi-rosso@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head
index 8ff6a2b..1ac6e20 100644
--- a/.rive_head
+++ b/.rive_head
@@ -1 +1 @@
-557c18ea05eb537d3812cebec0564a47218d8f67
+10e90773d57ba32506366d563cd738f4136f8e3b
diff --git a/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp b/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp
index 1caa2a6..907b6ed 100644
--- a/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp
+++ b/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp
@@ -193,6 +193,21 @@
     void vkQueueTransitionToLayout(Texture* texture,
                                    VkImageAspectFlags aspectMask,
                                    VkImageLayout newLayout);
+
+    // Deferred texture uploads: upload() can be called with no recording
+    // CB (verify hooks, scripted shader setup). Drained on the host CB
+    // at the next beginFrame / beginRenderPass.
+    struct VkPendingTextureUpload
+    {
+        // Base types so this header doesn't need TextureVulkan / BufferVulkan.
+        rcp<Texture> texture;
+        rcp<Buffer> stagingBuffer;
+        VkBufferImageCopy region;
+        VkImageAspectFlags aspectMask;
+    };
+    std::vector<VkPendingTextureUpload> m_vkPendingTextureUploads;
+    void vkQueuePendingTextureUpload(VkPendingTextureUpload pending);
+    void vkFlushPendingTextureUploads();
 };
 
 } // namespace rive::ore
diff --git a/renderer/include/rive/renderer/ore/ore_types.hpp b/renderer/include/rive/renderer/ore/ore_types.hpp
index 3f5e4d9..c6d4d7b 100644
--- a/renderer/include/rive/renderer/ore/ore_types.hpp
+++ b/renderer/include/rive/renderer/ore/ore_types.hpp
@@ -710,7 +710,11 @@
 
 struct Features
 {
+    // 32-bit float color render targets (rgba32float etc).
     bool colorBufferFloat = false;
+    // 16-bit float color render targets (rgba16float etc). Implied by
+    // colorBufferFloat, but on WebGL can be present without it.
+    bool colorBufferHalfFloat = false;
     bool perTargetBlend = false;
     bool perTargetWriteMask = false;
     bool textureViewSampling = false;
diff --git a/renderer/src/ore/d3d12/ore_texture_d3d12.cpp b/renderer/src/ore/d3d12/ore_texture_d3d12.cpp
index bf86c6f..1d3c710 100644
--- a/renderer/src/ore/d3d12/ore_texture_d3d12.cpp
+++ b/renderer/src/ore/d3d12/ore_texture_d3d12.cpp
@@ -10,6 +10,7 @@
 #include <d3d12.h>
 #include <wrl/client.h>
 #include <cassert>
+#include <cstdint>
 #include <cstring>
 
 using Microsoft::WRL::ComPtr;
@@ -20,42 +21,139 @@
 void TextureD3D12::upload(const TextureDataDesc& data)
 {
 #if defined(ORE_BACKEND_D3D12)
+    assert(m_d3dOreContext != nullptr);
+    ContextD3D12* ctx = m_d3dOreContext;
+    // Internal invariants: a live, non-external texture always has these.
     assert(m_d3dTexture != nullptr);
     assert(m_d3dDevice != nullptr);
-    assert(m_d3dOreContext != nullptr);
-    assert(!m_d3dIsExternal &&
-           "Cannot upload into an external (canvas-wrapped) texture");
-    assert(data.data != nullptr);
 
-    ContextD3D12* ctx = m_d3dOreContext;
+    // Script-reachable failures report through lastError instead of aborting.
+    if (m_d3dIsExternal)
+    {
+        ctx->setLastError(
+            "upload: cannot upload into an external (canvas-wrapped) texture");
+        return;
+    }
+    if (data.data == nullptr)
+    {
+        ctx->setLastError("upload: data is null");
+        return;
+    }
 
-    // Compute the placed footprint for the requested subresource.
-    // D3D12 subresource index formula:
-    //   `mipSlice + arraySlice * MipLevels + planeSlice * MipLevels *
-    //   ArraySize`
-    // (see D3D12CalcSubresource in d3dx12.h). Pre-fix this was
-    // `mipLevel * DepthOrArraySize + layer`, which is the *transposed*
-    // formula — works only when MipLevels=1 (every Ore GM today) and
-    // silently mis-targets the wrong subresource for any mipmapped
-    // array texture.
+    const uint32_t bpt = textureFormatBytesPerTexel(m_format);
+    // mipLevel must index a declared level.
+    if (data.mipLevel >= m_numMipmaps)
+    {
+        ctx->setLastError("upload: mipLevel (%u) >= numMipmaps (%u)",
+                          data.mipLevel,
+                          m_numMipmaps);
+        return;
+    }
+    // layer must index a declared slice (1 for non-array/non-cube).
+    if (data.layer >= m_depthOrArrayLayers)
+    {
+        ctx->setLastError("upload: layer (%u) >= depthOrArrayLayers (%u)",
+                          data.layer,
+                          m_depthOrArrayLayers);
+        return;
+    }
+    // Mip-adjusted extents (floor(max(1, dim >> mipLevel))).
+    const uint32_t mipWidth =
+        (m_width >> data.mipLevel) > 0 ? (m_width >> data.mipLevel) : 1u;
+    const uint32_t mipHeight =
+        (m_height >> data.mipLevel) > 0 ? (m_height >> data.mipLevel) : 1u;
+    const uint32_t width = data.width > 0 ? data.width : mipWidth;
+    const uint32_t height = data.height > 0 ? data.height : mipHeight;
+    // Only 3D carries depth > 1. The 2D family picks its slice via the
+    // subresource index and uploads at depth 1.
+    const uint32_t maxDepth =
+        m_type == TextureType::texture3D ? m_depthOrArrayLayers : 1u;
+    const uint32_t depth = data.depth > 0 ? data.depth : maxDepth;
+    // Region must fit the mip extent. 64-bit so a big offset can't wrap.
+    if (static_cast<uint64_t>(data.x) + width > mipWidth ||
+        static_cast<uint64_t>(data.y) + height > mipHeight)
+    {
+        ctx->setLastError("upload: region (x=%u y=%u w=%u h=%u) out of bounds "
+                          "for mip %u (%ux%u)",
+                          data.x,
+                          data.y,
+                          width,
+                          height,
+                          data.mipLevel,
+                          mipWidth,
+                          mipHeight);
+        return;
+    }
+    if (static_cast<uint64_t>(data.z) + depth > maxDepth)
+    {
+        ctx->setLastError(
+            "upload: z-region (z=%u depth=%u) out of bounds (maxDepth=%u)",
+            data.z,
+            depth,
+            maxDepth);
+        return;
+    }
+    // bytesPerTexel == 0 is block-compressed. No block-aware path yet.
+    if (bpt == 0)
+    {
+        ctx->setLastError("upload: block-compressed formats not yet supported");
+        return;
+    }
+    if (data.bytesPerRow != 0 && (data.bytesPerRow % bpt) != 0)
+    {
+        ctx->setLastError("upload: bytesPerRow (%u) must be a whole number of "
+                          "texels (bytesPerTexel=%u)",
+                          data.bytesPerRow,
+                          bpt);
+        return;
+    }
+    if (data.bytesPerRow != 0 &&
+        data.bytesPerRow < static_cast<uint64_t>(width) * bpt)
+    {
+        ctx->setLastError(
+            "upload: bytesPerRow (%u) < width * bytesPerTexel (%llu)",
+            data.bytesPerRow,
+            static_cast<unsigned long long>(static_cast<uint64_t>(width) *
+                                            bpt));
+        return;
+    }
+    if (data.rowsPerImage > 0 && data.rowsPerImage < height)
+    {
+        ctx->setLastError("upload: rowsPerImage (%u) < height (%u)",
+                          data.rowsPerImage,
+                          height);
+        return;
+    }
+
+    // Subresource index: mipSlice + arraySlice * MipLevels. Transposing it
+    // only works when MipLevels == 1.
     D3D12_RESOURCE_DESC texDesc = m_d3dTexture->GetDesc();
-    UINT subresource =
+    const UINT subresource =
         data.mipLevel + data.layer * static_cast<UINT>(texDesc.MipLevels);
 
-    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {};
-    UINT numRows = 0;
-    UINT64 rowBytes = 0;
-    UINT64 totalBytes = 0;
-    m_d3dDevice->GetCopyableFootprints(&texDesc,
-                                       subresource,
-                                       1,
-                                       /*baseOffset=*/0,
-                                       &footprint,
-                                       &numRows,
-                                       &rowBytes,
-                                       &totalBytes);
+    // Stage just the caller region: caller pitch on read, 256-aligned pitch on
+    // write. 64-bit so the total can't wrap before the uint32_t guard.
+    const uint64_t srcRow = data.bytesPerRow != 0
+                                ? data.bytesPerRow
+                                : static_cast<uint64_t>(width) * bpt;
+    const uint32_t rowsPerImage =
+        data.rowsPerImage > 0 ? data.rowsPerImage : height;
+    const uint64_t srcSliceStride = srcRow * rowsPerImage;
+    const uint64_t copyRowBytes = static_cast<uint64_t>(width) * bpt;
+    const uint64_t dstRowPitch =
+        (copyRowBytes + (D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1)) &
+        ~static_cast<uint64_t>(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1);
+    const uint64_t totalBytes = dstRowPitch * height * depth;
+    // BufferD3D12 size is uint32_t. Refuse rather than truncate.
+    if (totalBytes > UINT32_MAX)
+    {
+        ctx->setLastError(
+            "upload: size (%llu) exceeds uint32_t staging buffer max",
+            static_cast<unsigned long long>(totalBytes));
+        return;
+    }
 
-    // Allocate a staging (UPLOAD heap) buffer.
+    // Allocate a staging (UPLOAD heap) buffer sized to the region footprint.
     D3D12_HEAP_PROPERTIES uploadHeap = {};
     uploadHeap.Type = D3D12_HEAP_TYPE_UPLOAD;
 
@@ -75,28 +173,45 @@
                                          static_cast<uint32_t>(totalBytes),
                                          BufferUsage::upload));
 
-    [[maybe_unused]] HRESULT hr = m_d3dDevice->CreateCommittedResource(
+    HRESULT hr = m_d3dDevice->CreateCommittedResource(
         &uploadHeap,
         D3D12_HEAP_FLAG_NONE,
         &bufDesc,
         D3D12_RESOURCE_STATE_GENERIC_READ,
         nullptr,
         IID_PPV_ARGS(m_uploadBuffer->m_d3dBuffer.GetAddressOf()));
-    assert(SUCCEEDED(hr) && "Texture::upload: staging buffer creation failed");
+    if (FAILED(hr))
+    {
+        m_uploadBuffer = nullptr;
+        ctx->setLastError("upload: staging buffer creation failed (hr=0x%08x)",
+                          static_cast<unsigned>(hr));
+        return;
+    }
 
-    // Map and copy row by row, respecting the D3D12 pitch alignment.
     void* mapped = nullptr;
     D3D12_RANGE readRange = {0, 0};
-    m_uploadBuffer->m_d3dBuffer->Map(0, &readRange, &mapped);
+    hr = m_uploadBuffer->m_d3dBuffer->Map(0, &readRange, &mapped);
+    if (FAILED(hr) || mapped == nullptr)
+    {
+        m_uploadBuffer = nullptr;
+        ctx->setLastError("upload: staging buffer map failed (hr=0x%08x)",
+                          static_cast<unsigned>(hr));
+        return;
+    }
 
+    // Copy the region row by row, never the full subresource (the over-read).
     const uint8_t* src = static_cast<const uint8_t*>(data.data);
     uint8_t* dst = static_cast<uint8_t*>(mapped);
-    UINT64 srcRow = data.bytesPerRow ? data.bytesPerRow : rowBytes;
-    UINT64 dstRow = footprint.Footprint.RowPitch;
-
-    for (UINT row = 0; row < numRows; ++row)
-        memcpy(dst + row * dstRow, src + row * srcRow, (size_t)rowBytes);
-
+    for (uint32_t z = 0; z < depth; ++z)
+    {
+        for (uint32_t y = 0; y < height; ++y)
+        {
+            memcpy(dst + (static_cast<uint64_t>(z) * height + y) * dstRowPitch,
+                   src + static_cast<uint64_t>(z) * srcSliceStride +
+                       static_cast<uint64_t>(y) * srcRow,
+                   static_cast<size_t>(copyRowBytes));
+        }
+    }
     m_uploadBuffer->m_d3dBuffer->Unmap(0, nullptr);
 
     // If the texture is not in COPY_DEST state, transition it.
@@ -113,27 +228,31 @@
         m_d3dCurrentState = D3D12_RESOURCE_STATE_COPY_DEST;
     }
 
-    // Record the copy.
+    // Copy the whole staged region to (x, y, z). No src box needed.
     D3D12_TEXTURE_COPY_LOCATION dst_loc = {};
     dst_loc.pResource = m_d3dTexture.Get();
     dst_loc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
     dst_loc.SubresourceIndex = subresource;
 
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {};
+    footprint.Offset = 0;
+    footprint.Footprint.Format = texDesc.Format;
+    footprint.Footprint.Width = width;
+    footprint.Footprint.Height = height;
+    footprint.Footprint.Depth = depth;
+    footprint.Footprint.RowPitch = static_cast<UINT>(dstRowPitch);
+
     D3D12_TEXTURE_COPY_LOCATION src_loc = {};
     src_loc.pResource = m_uploadBuffer->m_d3dBuffer.Get();
     src_loc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
     src_loc.PlacedFootprint = footprint;
 
-    D3D12_BOX box = {};
-    box.left = data.x;
-    box.top = data.y;
-    box.front = data.z;
-    box.right = data.x + data.width;
-    box.bottom = data.y + data.height;
-    box.back = data.z + data.depth;
-
-    ctx->m_d3dCmdList
-        ->CopyTextureRegion(&dst_loc, data.x, data.y, data.z, &src_loc, &box);
+    ctx->m_d3dCmdList->CopyTextureRegion(&dst_loc,
+                                         data.x,
+                                         data.y,
+                                         data.z,
+                                         &src_loc,
+                                         nullptr);
 
     // Transition to PIXEL_SHADER_RESOURCE so it's ready to sample without
     // an explicit barrier in the render pass.
diff --git a/renderer/src/ore/gl/ore_context_gl.cpp b/renderer/src/ore/gl/ore_context_gl.cpp
index 4ef34d5..52da6a4 100644
--- a/renderer/src/ore/gl/ore_context_gl.cpp
+++ b/renderer/src/ore/gl/ore_context_gl.cpp
@@ -239,7 +239,11 @@
         if (!ext)
             continue;
         if (strcmp(ext, "GL_EXT_color_buffer_float") == 0)
+        {
+            // Full-float renderability implies half-float renderability.
             f.colorBufferFloat = true;
+            f.colorBufferHalfFloat = true;
+        }
         else if (strcmp(ext, "GL_EXT_texture_filter_anisotropic") == 0)
             f.anisotropicFiltering = true;
         else if (strcmp(ext, "GL_KHR_texture_compression_astc_ldr") == 0)
@@ -250,6 +254,27 @@
 #endif
     }
 
+    // drawBaseInstance stays false: the GL render pass does not yet wire
+    // baseVertex/firstInstance through the base-vertex-base-instance draw
+    // call, so reporting it true would silently drop those args.
+
+#ifdef RIVE_WEBGL
+    // On WebGL an extension must be enabled before its functionality (and
+    // sometimes its name in the extension list) is available, so the scan
+    // above misses it. Enable returns whether the browser supports it.
+    // The two extensions are independent: half-float gates 16-bit float
+    // targets, full-float gates 32-bit, and full-float implies half-float.
+    auto webglCtx = emscripten_webgl_get_current_context();
+    if (emscripten_webgl_enable_extension(webglCtx, "EXT_color_buffer_float"))
+    {
+        f.colorBufferFloat = true;
+        f.colorBufferHalfFloat = true;
+    }
+    if (emscripten_webgl_enable_extension(webglCtx,
+                                          "EXT_color_buffer_half_float"))
+        f.colorBufferHalfFloat = true;
+#endif
+
     return ctx;
 }
 
diff --git a/renderer/src/ore/gl/ore_render_pass_gl.cpp b/renderer/src/ore/gl/ore_render_pass_gl.cpp
index aa16243..2c12e2b 100644
--- a/renderer/src/ore/gl/ore_render_pass_gl.cpp
+++ b/renderer/src/ore/gl/ore_render_pass_gl.cpp
@@ -499,6 +499,8 @@
     assert(m_currentPipeline != nullptr);
     GLenum mode = oreTopologyToGL(m_currentPipeline->desc().topology);
 
+    // drawBaseInstance=false on GL, so the Lua guard rejects firstInstance>0.
+    // Wiring needs glDrawArraysInstancedBaseInstance (ANGLE).
     (void)firstInstance;
 
     if (instanceCount > 1)
@@ -524,6 +526,9 @@
     const void* offset = reinterpret_cast<const void*>(
         static_cast<uintptr_t>(firstIndex * indexSize));
 
+    // drawBaseInstance=false on GL, so the Lua guard rejects baseVertex!=0 and
+    // firstInstance>0. Wiring needs
+    // glDrawElementsInstancedBaseVertexBaseInstance (ANGLE).
     (void)baseVertex;
     (void)firstInstance;
 
diff --git a/renderer/src/ore/vulkan/ore_context_vulkan.cpp b/renderer/src/ore/vulkan/ore_context_vulkan.cpp
index bc1c8a5..7c00f05 100644
--- a/renderer/src/ore/vulkan/ore_context_vulkan.cpp
+++ b/renderer/src/ore/vulkan/ore_context_vulkan.cpp
@@ -506,6 +506,152 @@
     m_vkPendingInitialTransitions.clear();
 }
 
+void ContextVulkan::vkQueuePendingTextureUpload(VkPendingTextureUpload pending)
+{
+    // No dedupe: uploads target sub-regions (offset, mipLevel, layer),
+    // so multiple calls to the same texture are all meaningful.
+    m_vkPendingTextureUploads.push_back(std::move(pending));
+}
+
+void ContextVulkan::vkFlushPendingTextureUploads()
+{
+    if (m_vkPendingTextureUploads.empty())
+        return;
+    // Caller contract: only invoked from beginFrame / beginRenderPass,
+    // both of which run with a host-owned recording CB.
+    assert(m_vkCmdBufRecording && m_vkCommandBuffer != VK_NULL_HANDLE);
+
+    // Resolve the valid uploads and collect the distinct destination images.
+    // Barriers batch per image (one to-transfer + one to-shader for the whole
+    // flush) instead of two per upload, so an N-layer array texture costs 2
+    // barriers, not 2N.
+    struct DistinctTex
+    {
+        TextureVulkan* tex;
+        VkImageLayout oldLayout;
+        VkImageAspectFlags aspectMask;
+    };
+    std::vector<DistinctTex> distinct;
+    struct ValidUpload
+    {
+        VkBuffer srcBuffer;
+        VkImage dstImage;
+        const VkBufferImageCopy* region;
+    };
+    std::vector<ValidUpload> uploads;
+    uploads.reserve(m_vkPendingTextureUploads.size());
+    for (auto& pu : m_vkPendingTextureUploads)
+    {
+        auto* vkTex = lite_rtti_cast<TextureVulkan*>(pu.texture.get());
+        auto* vkBuf = lite_rtti_cast<BufferVulkan*>(pu.stagingBuffer.get());
+        // Texture or staging buffer torn down before flush. Skip silently.
+        if (vkTex == nullptr || vkBuf == nullptr ||
+            vkTex->m_vkImage == VK_NULL_HANDLE)
+            continue;
+        uploads.push_back({vkBuf->m_vkBuffer, vkTex->m_vkImage, &pu.region});
+        bool found = false;
+        for (auto& d : distinct)
+        {
+            if (d.tex == vkTex)
+            {
+                d.aspectMask |= pu.aspectMask;
+                found = true;
+                break;
+            }
+        }
+        if (!found)
+            distinct.push_back({vkTex, vkTex->m_vkLayout, pu.aspectMask});
+    }
+    if (uploads.empty())
+    {
+        m_vkPendingTextureUploads.clear();
+        return;
+    }
+
+    // 1) Transition every distinct image to TRANSFER_DST in one barrier. The
+    // src scope comes from each image's tracked layout so the upload waits on
+    // any prior use (attachment write, shader read) instead of racing it.
+    std::vector<VkImageMemoryBarrier> toTransfer;
+    toTransfer.reserve(distinct.size());
+    VkPipelineStageFlags srcStageAcc = 0;
+    for (auto& d : distinct)
+    {
+        VkPipelineStageFlags srcStage;
+        VkAccessFlags srcAccess;
+        pipelineStageAndAccessForLayout(d.oldLayout, &srcStage, &srcAccess);
+        srcStageAcc |= srcStage;
+        VkImageMemoryBarrier b{};
+        b.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+        b.srcAccessMask = srcAccess;
+        b.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+        b.oldLayout = d.oldLayout;
+        b.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+        b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        b.image = d.tex->m_vkImage;
+        b.subresourceRange.aspectMask = d.aspectMask;
+        b.subresourceRange.baseMipLevel = 0;
+        b.subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS;
+        b.subresourceRange.baseArrayLayer = 0;
+        b.subresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS;
+        toTransfer.push_back(b);
+    }
+    m_vk->CmdPipelineBarrier(m_vkCommandBuffer,
+                             srcStageAcc,
+                             VK_PIPELINE_STAGE_TRANSFER_BIT,
+                             0,
+                             0,
+                             nullptr,
+                             0,
+                             nullptr,
+                             static_cast<uint32_t>(toTransfer.size()),
+                             toTransfer.data());
+
+    // 2) Copy each staged region. Disjoint sub-regions (distinct offset / mip
+    // / layer) need no inter-copy barrier.
+    for (auto& u : uploads)
+    {
+        m_vk->CmdCopyBufferToImage(m_vkCommandBuffer,
+                                   u.srcBuffer,
+                                   u.dstImage,
+                                   VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                   1,
+                                   u.region);
+    }
+
+    // 3) Transition every distinct image back to SHADER_READ_ONLY in one
+    // barrier.
+    VkPipelineStageFlags dstStage;
+    VkAccessFlags dstAccess;
+    pipelineStageAndAccessForLayout(VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                    &dstStage,
+                                    &dstAccess);
+    std::vector<VkImageMemoryBarrier> toShader = toTransfer;
+    for (auto& b : toShader)
+    {
+        b.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+        b.dstAccessMask = dstAccess;
+        b.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+        b.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+    }
+    m_vk->CmdPipelineBarrier(m_vkCommandBuffer,
+                             VK_PIPELINE_STAGE_TRANSFER_BIT,
+                             dstStage,
+                             0,
+                             0,
+                             nullptr,
+                             0,
+                             nullptr,
+                             static_cast<uint32_t>(toShader.size()),
+                             toShader.data());
+    for (auto& d : distinct)
+        d.tex->m_vkLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+    // Dropping the rcps releases the staging buffers; GPUResource
+    // purgatory keeps them alive until safeFrameNumber.
+    m_vkPendingTextureUploads.clear();
+}
+
 void ContextVulkan::beginFrame(const FrameDescriptor& desc)
 {
     assert(desc.externalCommandBuffer != VK_NULL_HANDLE);
@@ -519,8 +665,8 @@
         static_cast<VkCommandBuffer>(desc.externalCommandBuffer);
     m_vkCmdBufRecording = true;
 
-    // Flush any lazy-init barriers queued before this frame.  Host has
-    // already called BeginCommandBuffer on externalCb, so recording is safe.
+    // Drain pre-frame deferred work onto the host's CB.
+    vkFlushPendingTextureUploads();
     vkFlushPendingInitialTransitions();
 }
 
@@ -1409,10 +1555,9 @@
     rpBI.clearValueCount = attachCount;
     rpBI.pClearValues = clearValues;
 
-    // Flush any pending sampled-image transitions (queued by makeBindGroup
-    // and the loadOp=load attachment branches above) before entering the
-    // pass — barriers inside a pass are restricted to declared self-
-    // dependencies, so cross-pass image-layout work must land here.
+    // Drain barriers and uploads outside the pass; transfers and
+    // unrestricted barriers are not allowed inside one.
+    vkFlushPendingTextureUploads();
     vkFlushPendingInitialTransitions();
 
     m_vk->CmdBeginRenderPass(m_vkCommandBuffer,
diff --git a/renderer/src/ore/vulkan/ore_texture_vulkan.cpp b/renderer/src/ore/vulkan/ore_texture_vulkan.cpp
index 5ecaea6..50ce778 100644
--- a/renderer/src/ore/vulkan/ore_texture_vulkan.cpp
+++ b/renderer/src/ore/vulkan/ore_texture_vulkan.cpp
@@ -49,132 +49,147 @@
     return VK_IMAGE_ASPECT_COLOR_BIT;
 }
 
-// Transition an image from one layout to another using a pipeline barrier
-// on the given command buffer.
-static void transitionLayout(PFN_vkCmdPipelineBarrier pfnCmdPipelineBarrier,
-                             VkCommandBuffer cmdBuf,
-                             VkImage image,
-                             VkImageAspectFlags aspectMask,
-                             VkImageLayout oldLayout,
-                             VkImageLayout newLayout,
-                             uint32_t mipCount = 1,
-                             uint32_t layerCount = 1)
-{
-    VkImageMemoryBarrier barrier{};
-    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
-    barrier.oldLayout = oldLayout;
-    barrier.newLayout = newLayout;
-    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
-    barrier.image = image;
-    barrier.subresourceRange.aspectMask = aspectMask;
-    barrier.subresourceRange.baseMipLevel = 0;
-    barrier.subresourceRange.levelCount = mipCount;
-    barrier.subresourceRange.baseArrayLayer = 0;
-    barrier.subresourceRange.layerCount = layerCount;
-
-    VkPipelineStageFlags srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-    VkPipelineStageFlags dstStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-
-    if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
-        newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
-    {
-        barrier.srcAccessMask = 0;
-        barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-        srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-        dstStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
-    }
-    else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL &&
-             newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
-    {
-        barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-        srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT;
-        dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
-    }
-    else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
-             newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
-    {
-        barrier.srcAccessMask = 0;
-        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-        srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-        dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
-    }
-    else if (oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL &&
-             newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL)
-    {
-        barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT;
-        barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
-        srcStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
-        dstStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-    }
-    else if (oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL &&
-             newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
-    {
-        barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
-        barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-        srcStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-        dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
-    }
-    else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
-             newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL)
-    {
-        barrier.srcAccessMask = 0;
-        barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
-                                VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
-        srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-        dstStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT;
-    }
-    else
-    {
-        // Generic fallback — conservative but correct.
-        barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
-        barrier.dstAccessMask =
-            VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT;
-        srcStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
-        dstStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
-    }
-
-    pfnCmdPipelineBarrier(cmdBuf,
-                          srcStage,
-                          dstStage,
-                          0,
-                          0,
-                          nullptr,
-                          0,
-                          nullptr,
-                          1,
-                          &barrier);
-}
-
 // ============================================================================
 // Texture
 // ============================================================================
 
 void TextureVulkan::upload(const TextureDataDesc& data)
 {
-    // Staging upload records into whichever command buffer the owning
-    // Context currently has open — either Ore's own CB (owned-CB mode) or
-    // the host's CB (external-CB mode). Resolved at call time so the same
-    // Texture can span both modes across its lifetime.
+    // Stage CPU-side, queue for the next host CB. Callers may run with
+    // no recording CB (verify hooks, scripted shader setup).
     assert(m_vkOreContext != nullptr);
-    // Make sure the owned cb is in the recording state — scripts can call
-    // upload() outside a host-driven frame window (verify hooks during
-    // artboard construction, scripted shader effects during PLS draw).
-    VkCommandBuffer cmdBuf = m_vkOreContext->m_vkCommandBuffer;
-    auto pfnCmdPipelineBarrier = m_vkOreContext->m_vk->CmdPipelineBarrier;
-    auto pfnCmdCopyBufferToImage = m_vkOreContext->m_vk->CmdCopyBufferToImage;
-    assert(cmdBuf != VK_NULL_HANDLE);
     assert(m_vkImage != VK_NULL_HANDLE);
+    if (data.data == nullptr)
+    {
+        m_vkOreContext->setLastError("upload: data is null");
+        return;
+    }
 
-    // Determine byte size of the upload region.
-    uint32_t bytesPerRow = data.bytesPerRow;
-    uint32_t height = data.height > 0 ? data.height : m_height;
-    uint32_t depth = data.depth > 0 ? data.depth : m_depthOrArrayLayers;
-    VkDeviceSize uploadSize =
-        static_cast<VkDeviceSize>(bytesPerRow) * height * depth;
+    const uint32_t bptVK = textureFormatBytesPerTexel(m_format);
+    // mipLevel must index a declared level.
+    if (data.mipLevel >= m_numMipmaps)
+    {
+        m_vkOreContext->setLastError("upload: mipLevel (%u) >= numMipmaps (%u)",
+                                     data.mipLevel,
+                                     m_numMipmaps);
+        return;
+    }
+    // layer must index a declared slice (1 for non-array/non-cube).
+    if (data.layer >= m_depthOrArrayLayers)
+    {
+        m_vkOreContext->setLastError(
+            "upload: layer (%u) >= depthOrArrayLayers (%u)",
+            data.layer,
+            m_depthOrArrayLayers);
+        return;
+    }
+    // Mip-adjusted extents (Vulkan-spec floor(max(1, dim >> mipLevel))).
+    const uint32_t mipWidth =
+        (m_width >> data.mipLevel) > 0 ? (m_width >> data.mipLevel) : 1u;
+    const uint32_t mipHeight =
+        (m_height >> data.mipLevel) > 0 ? (m_height >> data.mipLevel) : 1u;
+    const uint32_t width = data.width > 0 ? data.width : mipWidth;
+    const uint32_t height = data.height > 0 ? data.height : mipHeight;
+    // imageExtent.depth is the copy's z-extent. The array slice is chosen via
+    // baseArrayLayer with layerCount 1, so only true 3D textures carry depth
+    // greater than 1. Defaulting to m_depthOrArrayLayers would set depth to the
+    // array count for array2D/cube and over-read the staging source.
+    const uint32_t maxDepth =
+        m_type == TextureType::texture3D ? m_depthOrArrayLayers : 1u;
+    const uint32_t depth = data.depth > 0 ? data.depth : maxDepth;
+    // Region must fit within the mip's extent. 64-bit so a large x/y offset
+    // can't wrap past the guard.
+    if (static_cast<uint64_t>(data.x) + width > mipWidth ||
+        static_cast<uint64_t>(data.y) + height > mipHeight)
+    {
+        m_vkOreContext->setLastError(
+            "upload: region (x=%u y=%u w=%u h=%u) out of bounds for "
+            "mip %u (%ux%u)",
+            data.x,
+            data.y,
+            width,
+            height,
+            data.mipLevel,
+            mipWidth,
+            mipHeight);
+        return;
+    }
+    // z-slice must fit the texture depth (1 for everything but 3D).
+    if (static_cast<uint64_t>(data.z) + depth > maxDepth)
+    {
+        m_vkOreContext->setLastError(
+            "upload: z-region (z=%u depth=%u) out of bounds (maxDepth=%u)",
+            data.z,
+            depth,
+            maxDepth);
+        return;
+    }
+    // bytesPerTexel == 0 means a block-compressed format. We don't have
+    // a block-size-aware path for bufferRowLength yet, so reject upfront.
+    if (bptVK == 0)
+    {
+        m_vkOreContext->setLastError(
+            "upload: block-compressed formats not yet supported");
+        return;
+    }
+    // Uncompressed: bytesPerRow must be a whole number of texels and
+    // cover at least width texels so bufferRowLength can encode the
+    // caller pitch and the GPU read stays inside the staging buffer.
+    if (data.bytesPerRow != 0 && (data.bytesPerRow % bptVK) != 0)
+    {
+        m_vkOreContext->setLastError(
+            "upload: bytesPerRow (%u) must be a whole number of texels "
+            "(bytesPerTexel=%u)",
+            data.bytesPerRow,
+            bptVK);
+        return;
+    }
+    if (data.bytesPerRow != 0 &&
+        data.bytesPerRow < static_cast<uint64_t>(width) * bptVK)
+    {
+        m_vkOreContext->setLastError(
+            "upload: bytesPerRow (%u) < width * bytesPerTexel (%llu)",
+            data.bytesPerRow,
+            static_cast<unsigned long long>(static_cast<uint64_t>(width) *
+                                            bptVK));
+        return;
+    }
+    // rowsPerImage is the per-slice stride; 0 means "use height", otherwise
+    // it must cover at least height rows.
+    if (data.rowsPerImage > 0 && data.rowsPerImage < height)
+    {
+        m_vkOreContext->setLastError("upload: rowsPerImage (%u) < height (%u)",
+                                     data.rowsPerImage,
+                                     height);
+        return;
+    }
+    // Derive row/total sizes in 64-bit so the tightly-packed fallback
+    // (width * bptVK) and the total can't wrap before the uint32_t guard.
+    // bytesPerRow == 0 means tightly packed.
+    const uint64_t bytesPerRow64 = data.bytesPerRow != 0
+                                       ? static_cast<uint64_t>(data.bytesPerRow)
+                                       : static_cast<uint64_t>(width) * bptVK;
+    const uint32_t rowsPerImage =
+        data.rowsPerImage > 0 ? data.rowsPerImage : height;
+    const VkDeviceSize uploadSize = bytesPerRow64 *
+                                    static_cast<uint64_t>(rowsPerImage) *
+                                    static_cast<uint64_t>(depth);
+    // BufferVulkan size is uint32_t; refuse larger uploads rather than
+    // silently truncating the staging copy.
+    if (uploadSize > UINT32_MAX)
+    {
+        m_vkOreContext->setLastError(
+            "upload: size (%llu) exceeds uint32_t staging buffer max",
+            static_cast<unsigned long long>(uploadSize));
+        return;
+    }
 
-    // Create a transient host-visible staging buffer.
+    auto stagingBuffer =
+        rcp<BufferVulkan>(new BufferVulkan(m_manager,
+                                           static_cast<uint32_t>(uploadSize),
+                                           BufferUsage::upload));
+    stagingBuffer->m_vk = m_vk;
+
     VkBufferCreateInfo bufCI{};
     bufCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
     bufCI.size = uploadSize;
@@ -184,50 +199,33 @@
     allocCI.usage = VMA_MEMORY_USAGE_CPU_ONLY;
     allocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
 
-    m_stagingBuffer =
-        rcp<BufferVulkan>(new BufferVulkan(m_manager,
-                                           static_cast<uint32_t>(uploadSize),
-                                           BufferUsage::upload));
-    m_stagingBuffer->m_vk = m_vk;
-
     VmaAllocationInfo allocInfo{};
-    vmaCreateBuffer(m_vk->allocator(),
-                    &bufCI,
-                    &allocCI,
-                    &m_stagingBuffer->m_vkBuffer,
-                    &m_stagingBuffer->m_vmaAllocation,
-                    &allocInfo);
-    m_stagingBuffer->m_vkMappedPtr = allocInfo.pMappedData;
+    VkResult vmaRes = vmaCreateBuffer(m_vk->allocator(),
+                                      &bufCI,
+                                      &allocCI,
+                                      &stagingBuffer->m_vkBuffer,
+                                      &stagingBuffer->m_vmaAllocation,
+                                      &allocInfo);
+    if (vmaRes != VK_SUCCESS || allocInfo.pMappedData == nullptr)
+    {
+        m_vkOreContext->setLastError(
+            "upload: staging buffer allocation failed (size=%llu, vk=%d)",
+            static_cast<unsigned long long>(uploadSize),
+            static_cast<int>(vmaRes));
+        return;
+    }
+    stagingBuffer->m_vkMappedPtr = allocInfo.pMappedData;
 
-    m_stagingBuffer->update(data.data, static_cast<uint32_t>(uploadSize), 0);
+    stagingBuffer->update(data.data, static_cast<uint32_t>(uploadSize), 0);
 
-    // Transition to TRANSFER_DST.
-    transitionLayout(pfnCmdPipelineBarrier,
-                     cmdBuf,
-                     m_vkImage,
-                     aspectMask(m_format),
-                     m_vkLayout,
-                     VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                     m_numMipmaps,
-                     m_depthOrArrayLayers);
-    m_vkLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-
-    // Copy staging buffer → image. `bufferRowLength` and
-    // `bufferImageHeight` are in texels — 0 means "tightly packed at
-    // imageExtent". When the caller's source has padding (e.g. a
-    // sub-rect of a larger image, or `bytesPerRow > width *
-    // bytesPerTexel`), the staging copy must honour that pitch or the
-    // image comes back with the padding bytes interleaved into every
-    // other row. Pre-fix this was hardcoded to 0; the
-    // `ore_array_upload` GM (Phase 4 witness) caught the regression
-    // on every Android Vulkan target.
-    const uint32_t bptVK = textureFormatBytesPerTexel(m_format);
+    // bufferRowLength / bufferImageHeight are in texels; 0 means tightly
+    // packed at imageExtent. Honour caller pitch when present. Upstream
+    // guards make the div-by-bptVK safe. The `ore_array_upload` GM locks
+    // this — without it the GL backend silently strides wrong.
     VkBufferImageCopy region{};
     region.bufferOffset = 0;
     region.bufferRowLength =
-        (data.bytesPerRow != 0 && bptVK != 0 && (data.bytesPerRow % bptVK) == 0)
-            ? data.bytesPerRow / bptVK
-            : 0;
+        data.bytesPerRow != 0 ? data.bytesPerRow / bptVK : 0;
     region.bufferImageHeight = data.rowsPerImage;
     region.imageSubresource.aspectMask = aspectMask(m_format);
     region.imageSubresource.mipLevel = data.mipLevel;
@@ -236,25 +234,14 @@
     region.imageOffset = {static_cast<int32_t>(data.x),
                           static_cast<int32_t>(data.y),
                           static_cast<int32_t>(data.z)};
-    region.imageExtent = {data.width > 0 ? data.width : m_width, height, depth};
+    region.imageExtent = {width, height, depth};
 
-    pfnCmdCopyBufferToImage(cmdBuf,
-                            m_stagingBuffer->m_vkBuffer,
-                            m_vkImage,
-                            VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                            1,
-                            &region);
-
-    // Transition back to SHADER_READ_ONLY.
-    transitionLayout(pfnCmdPipelineBarrier,
-                     cmdBuf,
-                     m_vkImage,
-                     aspectMask(m_format),
-                     VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                     m_numMipmaps,
-                     m_depthOrArrayLayers);
-    m_vkLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+    m_vkOreContext->vkQueuePendingTextureUpload({
+        ref_rcp(this),
+        std::move(stagingBuffer),
+        region,
+        aspectMask(m_format),
+    });
 }
 
 TextureVulkan::~TextureVulkan()
diff --git a/renderer/src/ore/vulkan/ore_texture_vulkan.hpp b/renderer/src/ore/vulkan/ore_texture_vulkan.hpp
index 88ec78a..6aa349c 100644
--- a/renderer/src/ore/vulkan/ore_texture_vulkan.hpp
+++ b/renderer/src/ore/vulkan/ore_texture_vulkan.hpp
@@ -27,7 +27,6 @@
     VkImageLayout m_vkLayout = VK_IMAGE_LAYOUT_UNDEFINED;
     VkDevice m_vkDevice = VK_NULL_HANDLE; // Weak ref.
     rcp<rive::gpu::VulkanContext> m_vk;
-    rcp<BufferVulkan> m_stagingBuffer;
     // Back-ref so upload() can route through ContextVulkan. Weak ref.
     ContextVulkan* m_vkOreContext = nullptr;
 };
diff --git a/src/lua/renderer/lua_gpu.cpp b/src/lua/renderer/lua_gpu.cpp
index 8c2eae4..9ed8ea7 100644
--- a/src/lua/renderer/lua_gpu.cpp
+++ b/src/lua/renderer/lua_gpu.cpp
@@ -113,6 +113,10 @@
             return "r16float";
         case TextureFormat::rgba32float:
             return "rgba32float";
+        case TextureFormat::rg32float:
+            return "rg32float";
+        case TextureFormat::r32float:
+            return "r32float";
         case TextureFormat::rgb10a2unorm:
             return "rgb10a2unorm";
         case TextureFormat::r11g11b10float:
@@ -130,6 +134,34 @@
     }
 }
 
+// Float color render-target capability a format needs, or none if it is not a
+// float format. 16-bit floats need colorBufferHalfFloat; 32-bit and packed
+// floats need the full colorBufferFloat.
+enum class FloatColorClass
+{
+    none,
+    half,
+    full,
+};
+
+static FloatColorClass floatColorClass(TextureFormat fmt)
+{
+    switch (fmt)
+    {
+        case TextureFormat::rgba16float:
+        case TextureFormat::rg16float:
+        case TextureFormat::r16float:
+            return FloatColorClass::half;
+        case TextureFormat::rgba32float:
+        case TextureFormat::rg32float:
+        case TextureFormat::r32float:
+        case TextureFormat::r11g11b10float:
+            return FloatColorClass::full;
+        default:
+            return FloatColorClass::none;
+    }
+}
+
 static TextureType lua_totexturetype(lua_State* L, const char* s)
 {
     if (strcmp(s, "2d") == 0)
@@ -1131,6 +1163,29 @@
         lua_getoptionalnumberfield(L, descIdx, "layers", 1));
 
     auto* ctx = getOreContext(L);
+    // Gate float render targets: without the matching capability they make an
+    // incomplete FBO that renders black. Sampled-only float textures are fine.
+    // 16-bit floats need half-float, 32-bit and packed need full float.
+    if (desc.renderTarget)
+    {
+        FloatColorClass fc = floatColorClass(desc.format);
+        const Features& feat = ctx->features();
+        bool ok = fc == FloatColorClass::none ||
+                  (fc == FloatColorClass::half && feat.colorBufferHalfFloat) ||
+                  (fc == FloatColorClass::full && feat.colorBufferFloat);
+        if (!ok)
+        {
+            const char* cap = fc == FloatColorClass::half
+                                  ? "colorBufferHalfFloat"
+                                  : "colorBufferFloat";
+            luaL_error(L,
+                       "GPUTexture.new: float format %s as a renderTarget "
+                       "requires the %s feature, which the active backend does "
+                       "not support",
+                       lua_totextureformatstring(desc.format),
+                       cap);
+        }
+    }
     ctx->clearLastError();
     auto texture = ctx->makeTexture(desc);
     if (!texture)
@@ -2511,6 +2566,14 @@
         lua_isnumber(L, 5) ? static_cast<int32_t>(lua_tointeger(L, 5)) : 0;
     uint32_t firstInstance =
         lua_isnumber(L, 6) ? static_cast<uint32_t>(lua_tonumber(L, 6)) : 0;
+    if (baseVertex != 0 && !getOreContext(L)->features().drawBaseInstance)
+    {
+        luaL_error(L,
+                   "drawIndexed: baseVertex=%d requires the "
+                   "drawBaseInstance feature, which the active backend "
+                   "does not support",
+                   baseVertex);
+    }
     if (firstInstance > 0 && !getOreContext(L)->features().drawBaseInstance)
     {
         luaL_error(L,
diff --git a/tests/gm/ore_array_upload.cpp b/tests/gm/ore_array_upload.cpp
index e189cf8..80b7eb4 100644
--- a/tests/gm/ore_array_upload.cpp
+++ b/tests/gm/ore_array_upload.cpp
@@ -24,6 +24,11 @@
  *      times in stripes (or just garbage) instead of the intended
  *      solid-color square.
  *
+ *   3. Partial-region uploads. Layer 0 is written as four 2×2 sub-rect
+ *      tiles, so it samples identically but every backend's sub-region
+ *      path runs. Witnesses the D3D12 full-subresource over-read (pre-fix
+ *      it read the whole mip out of each small tile buffer).
+ *
  * Layer colors (mip 0):
  *   layer 0 → red    (255,   0,   0)
  *   layer 1 → green  (  0, 255,   0)
@@ -101,6 +106,29 @@
         }
     }
 }
+
+// 2×2 RGBA8 tile, non-tightly-packed (8 valid + 8 padding) so partial-region
+// uploads still exercise the pitch plumbing.
+static constexpr uint32_t kTileWidth = 2;
+static constexpr uint32_t kTileHeight = 2;
+static constexpr uint32_t kTilePaddedRowBytes = 16; // 2 * 4 valid + 8 padding.
+static constexpr uint32_t kTilePaddedBytes = kTilePaddedRowBytes * kTileHeight;
+
+static void fillSolidTile(uint8_t* dst, uint8_t r, uint8_t g, uint8_t b)
+{
+    memset(dst, 0xCC, kTilePaddedBytes); // padding sentinel.
+    for (uint32_t y = 0; y < kTileHeight; ++y)
+    {
+        uint8_t* row = dst + y * kTilePaddedRowBytes;
+        for (uint32_t x = 0; x < kTileWidth; ++x)
+        {
+            row[x * 4 + 0] = r;
+            row[x * 4 + 1] = g;
+            row[x * 4 + 2] = b;
+            row[x * 4 + 3] = 0xFF;
+        }
+    }
+}
 #endif
 
 class OreArrayUploadGM : public GM
@@ -211,12 +239,13 @@
         sampBGDesc.samplerCount = 1;
         auto sampBG = ctx.makeBindGroup(sampBGDesc);
 
-        m_ore.beginFrame(renderContext);
-
-        // Per-layer uploads. Each upload targets `(mipLevel=0, layer=N)`
-        // with a non-tightly-packed source (bytesPerRow = 32, double
-        // the tight row of 16) so the GL backend's GL_UNPACK_ROW_LENGTH
-        // plumbing is exercised.
+        // Per-layer uploads issued BEFORE m_ore.beginFrame() so the
+        // Vulkan backend's deferred-upload path is exercised (regression
+        // for the post-#12661 fix: upload outside an Ore frame must
+        // queue and drain at the next beginFrame / beginRenderPass).
+        // Each upload targets `(mipLevel=0, layer=N)` with a non-tightly-
+        // packed source (bytesPerRow = 32, double the tight row of 16)
+        // so the GL backend's GL_UNPACK_ROW_LENGTH plumbing is exercised.
         uint8_t layerData[kPaddedLayerBytes];
         struct LayerColor
         {
@@ -230,6 +259,39 @@
         };
         for (uint32_t layer = 0; layer < 4; ++layer)
         {
+            // Layer 0 as four 2×2 sub-rect tiles. Same solid-red result, but
+            // exercises every backend's partial-region path. Pre-fix D3D12
+            // over-read the whole mip out of each tile buffer.
+            if (layer == 0)
+            {
+                const uint32_t kTileOffsets[4][2] = {
+                    {0, 0},
+                    {kTileWidth, 0},
+                    {0, kTileHeight},
+                    {kTileWidth, kTileHeight},
+                };
+                uint8_t tileData[kTilePaddedBytes];
+                for (const auto& off : kTileOffsets)
+                {
+                    fillSolidTile(tileData,
+                                  kColors[0].r,
+                                  kColors[0].g,
+                                  kColors[0].b);
+                    TextureDataDesc upload{};
+                    upload.data = tileData;
+                    upload.bytesPerRow = kTilePaddedRowBytes;
+                    upload.rowsPerImage = kTileHeight;
+                    upload.mipLevel = 0;
+                    upload.layer = 0;
+                    upload.x = off[0];
+                    upload.y = off[1];
+                    upload.width = kTileWidth;
+                    upload.height = kTileHeight;
+                    upload.depth = 1;
+                    arrTex->upload(upload);
+                }
+                continue;
+            }
             fillSolidLayer(layerData,
                            kColors[layer].r,
                            kColors[layer].g,
@@ -246,6 +308,8 @@
             arrTex->upload(upload);
         }
 
+        m_ore.beginFrame(renderContext);
+
         ColorAttachment ca{};
         ca.view = canvasView.get();
         ca.loadOp = LoadOp::clear;