ore: gate float render targets and baseVertex on backend features (#12712) 10e90773d5 * ore: gate float render targets and baseVertex on backend features * ore: test float renderTarget and baseVertex feature guards fix(ore): d3d12 upload stages caller region, not full subresource (#12702) 0cc614761c fix(ore): ContextVulkan TextureVulkan upload defer to next recording CB (#12700) cccee5e480 * fix ContextVulkan TextureVulkan upload defer to next recording CB * ore vulkan upload: batch flush barriers, fix array depth default, 64-bit region bounds * ore vulkan upload: 64-bit bytesPerRow fallback before uint32_t guard Co-authored-by: Luigi Rosso <luigi-rosso@users.noreply.github.com>
diff --git a/.rive_head b/.rive_head index 8ff6a2b..1ac6e20 100644 --- a/.rive_head +++ b/.rive_head
@@ -1 +1 @@ -557c18ea05eb537d3812cebec0564a47218d8f67 +10e90773d57ba32506366d563cd738f4136f8e3b
diff --git a/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp b/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp index 1caa2a6..907b6ed 100644 --- a/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp +++ b/renderer/include/rive/renderer/ore/ore_context_vulkan.hpp
@@ -193,6 +193,21 @@ void vkQueueTransitionToLayout(Texture* texture, VkImageAspectFlags aspectMask, VkImageLayout newLayout); + + // Deferred texture uploads: upload() can be called with no recording + // CB (verify hooks, scripted shader setup). Drained on the host CB + // at the next beginFrame / beginRenderPass. + struct VkPendingTextureUpload + { + // Base types so this header doesn't need TextureVulkan / BufferVulkan. + rcp<Texture> texture; + rcp<Buffer> stagingBuffer; + VkBufferImageCopy region; + VkImageAspectFlags aspectMask; + }; + std::vector<VkPendingTextureUpload> m_vkPendingTextureUploads; + void vkQueuePendingTextureUpload(VkPendingTextureUpload pending); + void vkFlushPendingTextureUploads(); }; } // namespace rive::ore
diff --git a/renderer/include/rive/renderer/ore/ore_types.hpp b/renderer/include/rive/renderer/ore/ore_types.hpp index 3f5e4d9..c6d4d7b 100644 --- a/renderer/include/rive/renderer/ore/ore_types.hpp +++ b/renderer/include/rive/renderer/ore/ore_types.hpp
@@ -710,7 +710,11 @@ struct Features { + // 32-bit float color render targets (rgba32float etc). bool colorBufferFloat = false; + // 16-bit float color render targets (rgba16float etc). Implied by + // colorBufferFloat, but on WebGL can be present without it. + bool colorBufferHalfFloat = false; bool perTargetBlend = false; bool perTargetWriteMask = false; bool textureViewSampling = false;
diff --git a/renderer/src/ore/d3d12/ore_texture_d3d12.cpp b/renderer/src/ore/d3d12/ore_texture_d3d12.cpp index bf86c6f..1d3c710 100644 --- a/renderer/src/ore/d3d12/ore_texture_d3d12.cpp +++ b/renderer/src/ore/d3d12/ore_texture_d3d12.cpp
@@ -10,6 +10,7 @@ #include <d3d12.h> #include <wrl/client.h> #include <cassert> +#include <cstdint> #include <cstring> using Microsoft::WRL::ComPtr; @@ -20,42 +21,139 @@ void TextureD3D12::upload(const TextureDataDesc& data) { #if defined(ORE_BACKEND_D3D12) + assert(m_d3dOreContext != nullptr); + ContextD3D12* ctx = m_d3dOreContext; + // Internal invariants: a live, non-external texture always has these. assert(m_d3dTexture != nullptr); assert(m_d3dDevice != nullptr); - assert(m_d3dOreContext != nullptr); - assert(!m_d3dIsExternal && - "Cannot upload into an external (canvas-wrapped) texture"); - assert(data.data != nullptr); - ContextD3D12* ctx = m_d3dOreContext; + // Script-reachable failures report through lastError instead of aborting. + if (m_d3dIsExternal) + { + ctx->setLastError( + "upload: cannot upload into an external (canvas-wrapped) texture"); + return; + } + if (data.data == nullptr) + { + ctx->setLastError("upload: data is null"); + return; + } - // Compute the placed footprint for the requested subresource. - // D3D12 subresource index formula: - // `mipSlice + arraySlice * MipLevels + planeSlice * MipLevels * - // ArraySize` - // (see D3D12CalcSubresource in d3dx12.h). Pre-fix this was - // `mipLevel * DepthOrArraySize + layer`, which is the *transposed* - // formula — works only when MipLevels=1 (every Ore GM today) and - // silently mis-targets the wrong subresource for any mipmapped - // array texture. + const uint32_t bpt = textureFormatBytesPerTexel(m_format); + // mipLevel must index a declared level. + if (data.mipLevel >= m_numMipmaps) + { + ctx->setLastError("upload: mipLevel (%u) >= numMipmaps (%u)", + data.mipLevel, + m_numMipmaps); + return; + } + // layer must index a declared slice (1 for non-array/non-cube). + if (data.layer >= m_depthOrArrayLayers) + { + ctx->setLastError("upload: layer (%u) >= depthOrArrayLayers (%u)", + data.layer, + m_depthOrArrayLayers); + return; + } + // Mip-adjusted extents (floor(max(1, dim >> mipLevel))). + const uint32_t mipWidth = + (m_width >> data.mipLevel) > 0 ? (m_width >> data.mipLevel) : 1u; + const uint32_t mipHeight = + (m_height >> data.mipLevel) > 0 ? (m_height >> data.mipLevel) : 1u; + const uint32_t width = data.width > 0 ? data.width : mipWidth; + const uint32_t height = data.height > 0 ? data.height : mipHeight; + // Only 3D carries depth > 1. The 2D family picks its slice via the + // subresource index and uploads at depth 1. + const uint32_t maxDepth = + m_type == TextureType::texture3D ? m_depthOrArrayLayers : 1u; + const uint32_t depth = data.depth > 0 ? data.depth : maxDepth; + // Region must fit the mip extent. 64-bit so a big offset can't wrap. + if (static_cast<uint64_t>(data.x) + width > mipWidth || + static_cast<uint64_t>(data.y) + height > mipHeight) + { + ctx->setLastError("upload: region (x=%u y=%u w=%u h=%u) out of bounds " + "for mip %u (%ux%u)", + data.x, + data.y, + width, + height, + data.mipLevel, + mipWidth, + mipHeight); + return; + } + if (static_cast<uint64_t>(data.z) + depth > maxDepth) + { + ctx->setLastError( + "upload: z-region (z=%u depth=%u) out of bounds (maxDepth=%u)", + data.z, + depth, + maxDepth); + return; + } + // bytesPerTexel == 0 is block-compressed. No block-aware path yet. + if (bpt == 0) + { + ctx->setLastError("upload: block-compressed formats not yet supported"); + return; + } + if (data.bytesPerRow != 0 && (data.bytesPerRow % bpt) != 0) + { + ctx->setLastError("upload: bytesPerRow (%u) must be a whole number of " + "texels (bytesPerTexel=%u)", + data.bytesPerRow, + bpt); + return; + } + if (data.bytesPerRow != 0 && + data.bytesPerRow < static_cast<uint64_t>(width) * bpt) + { + ctx->setLastError( + "upload: bytesPerRow (%u) < width * bytesPerTexel (%llu)", + data.bytesPerRow, + static_cast<unsigned long long>(static_cast<uint64_t>(width) * + bpt)); + return; + } + if (data.rowsPerImage > 0 && data.rowsPerImage < height) + { + ctx->setLastError("upload: rowsPerImage (%u) < height (%u)", + data.rowsPerImage, + height); + return; + } + + // Subresource index: mipSlice + arraySlice * MipLevels. Transposing it + // only works when MipLevels == 1. D3D12_RESOURCE_DESC texDesc = m_d3dTexture->GetDesc(); - UINT subresource = + const UINT subresource = data.mipLevel + data.layer * static_cast<UINT>(texDesc.MipLevels); - D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {}; - UINT numRows = 0; - UINT64 rowBytes = 0; - UINT64 totalBytes = 0; - m_d3dDevice->GetCopyableFootprints(&texDesc, - subresource, - 1, - /*baseOffset=*/0, - &footprint, - &numRows, - &rowBytes, - &totalBytes); + // Stage just the caller region: caller pitch on read, 256-aligned pitch on + // write. 64-bit so the total can't wrap before the uint32_t guard. + const uint64_t srcRow = data.bytesPerRow != 0 + ? data.bytesPerRow + : static_cast<uint64_t>(width) * bpt; + const uint32_t rowsPerImage = + data.rowsPerImage > 0 ? data.rowsPerImage : height; + const uint64_t srcSliceStride = srcRow * rowsPerImage; + const uint64_t copyRowBytes = static_cast<uint64_t>(width) * bpt; + const uint64_t dstRowPitch = + (copyRowBytes + (D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1)) & + ~static_cast<uint64_t>(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1); + const uint64_t totalBytes = dstRowPitch * height * depth; + // BufferD3D12 size is uint32_t. Refuse rather than truncate. + if (totalBytes > UINT32_MAX) + { + ctx->setLastError( + "upload: size (%llu) exceeds uint32_t staging buffer max", + static_cast<unsigned long long>(totalBytes)); + return; + } - // Allocate a staging (UPLOAD heap) buffer. + // Allocate a staging (UPLOAD heap) buffer sized to the region footprint. D3D12_HEAP_PROPERTIES uploadHeap = {}; uploadHeap.Type = D3D12_HEAP_TYPE_UPLOAD; @@ -75,28 +173,45 @@ static_cast<uint32_t>(totalBytes), BufferUsage::upload)); - [[maybe_unused]] HRESULT hr = m_d3dDevice->CreateCommittedResource( + HRESULT hr = m_d3dDevice->CreateCommittedResource( &uploadHeap, D3D12_HEAP_FLAG_NONE, &bufDesc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(m_uploadBuffer->m_d3dBuffer.GetAddressOf())); - assert(SUCCEEDED(hr) && "Texture::upload: staging buffer creation failed"); + if (FAILED(hr)) + { + m_uploadBuffer = nullptr; + ctx->setLastError("upload: staging buffer creation failed (hr=0x%08x)", + static_cast<unsigned>(hr)); + return; + } - // Map and copy row by row, respecting the D3D12 pitch alignment. void* mapped = nullptr; D3D12_RANGE readRange = {0, 0}; - m_uploadBuffer->m_d3dBuffer->Map(0, &readRange, &mapped); + hr = m_uploadBuffer->m_d3dBuffer->Map(0, &readRange, &mapped); + if (FAILED(hr) || mapped == nullptr) + { + m_uploadBuffer = nullptr; + ctx->setLastError("upload: staging buffer map failed (hr=0x%08x)", + static_cast<unsigned>(hr)); + return; + } + // Copy the region row by row, never the full subresource (the over-read). const uint8_t* src = static_cast<const uint8_t*>(data.data); uint8_t* dst = static_cast<uint8_t*>(mapped); - UINT64 srcRow = data.bytesPerRow ? data.bytesPerRow : rowBytes; - UINT64 dstRow = footprint.Footprint.RowPitch; - - for (UINT row = 0; row < numRows; ++row) - memcpy(dst + row * dstRow, src + row * srcRow, (size_t)rowBytes); - + for (uint32_t z = 0; z < depth; ++z) + { + for (uint32_t y = 0; y < height; ++y) + { + memcpy(dst + (static_cast<uint64_t>(z) * height + y) * dstRowPitch, + src + static_cast<uint64_t>(z) * srcSliceStride + + static_cast<uint64_t>(y) * srcRow, + static_cast<size_t>(copyRowBytes)); + } + } m_uploadBuffer->m_d3dBuffer->Unmap(0, nullptr); // If the texture is not in COPY_DEST state, transition it. @@ -113,27 +228,31 @@ m_d3dCurrentState = D3D12_RESOURCE_STATE_COPY_DEST; } - // Record the copy. + // Copy the whole staged region to (x, y, z). No src box needed. D3D12_TEXTURE_COPY_LOCATION dst_loc = {}; dst_loc.pResource = m_d3dTexture.Get(); dst_loc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; dst_loc.SubresourceIndex = subresource; + D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {}; + footprint.Offset = 0; + footprint.Footprint.Format = texDesc.Format; + footprint.Footprint.Width = width; + footprint.Footprint.Height = height; + footprint.Footprint.Depth = depth; + footprint.Footprint.RowPitch = static_cast<UINT>(dstRowPitch); + D3D12_TEXTURE_COPY_LOCATION src_loc = {}; src_loc.pResource = m_uploadBuffer->m_d3dBuffer.Get(); src_loc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; src_loc.PlacedFootprint = footprint; - D3D12_BOX box = {}; - box.left = data.x; - box.top = data.y; - box.front = data.z; - box.right = data.x + data.width; - box.bottom = data.y + data.height; - box.back = data.z + data.depth; - - ctx->m_d3dCmdList - ->CopyTextureRegion(&dst_loc, data.x, data.y, data.z, &src_loc, &box); + ctx->m_d3dCmdList->CopyTextureRegion(&dst_loc, + data.x, + data.y, + data.z, + &src_loc, + nullptr); // Transition to PIXEL_SHADER_RESOURCE so it's ready to sample without // an explicit barrier in the render pass.
diff --git a/renderer/src/ore/gl/ore_context_gl.cpp b/renderer/src/ore/gl/ore_context_gl.cpp index 4ef34d5..52da6a4 100644 --- a/renderer/src/ore/gl/ore_context_gl.cpp +++ b/renderer/src/ore/gl/ore_context_gl.cpp
@@ -239,7 +239,11 @@ if (!ext) continue; if (strcmp(ext, "GL_EXT_color_buffer_float") == 0) + { + // Full-float renderability implies half-float renderability. f.colorBufferFloat = true; + f.colorBufferHalfFloat = true; + } else if (strcmp(ext, "GL_EXT_texture_filter_anisotropic") == 0) f.anisotropicFiltering = true; else if (strcmp(ext, "GL_KHR_texture_compression_astc_ldr") == 0) @@ -250,6 +254,27 @@ #endif } + // drawBaseInstance stays false: the GL render pass does not yet wire + // baseVertex/firstInstance through the base-vertex-base-instance draw + // call, so reporting it true would silently drop those args. + +#ifdef RIVE_WEBGL + // On WebGL an extension must be enabled before its functionality (and + // sometimes its name in the extension list) is available, so the scan + // above misses it. Enable returns whether the browser supports it. + // The two extensions are independent: half-float gates 16-bit float + // targets, full-float gates 32-bit, and full-float implies half-float. + auto webglCtx = emscripten_webgl_get_current_context(); + if (emscripten_webgl_enable_extension(webglCtx, "EXT_color_buffer_float")) + { + f.colorBufferFloat = true; + f.colorBufferHalfFloat = true; + } + if (emscripten_webgl_enable_extension(webglCtx, + "EXT_color_buffer_half_float")) + f.colorBufferHalfFloat = true; +#endif + return ctx; }
diff --git a/renderer/src/ore/gl/ore_render_pass_gl.cpp b/renderer/src/ore/gl/ore_render_pass_gl.cpp index aa16243..2c12e2b 100644 --- a/renderer/src/ore/gl/ore_render_pass_gl.cpp +++ b/renderer/src/ore/gl/ore_render_pass_gl.cpp
@@ -499,6 +499,8 @@ assert(m_currentPipeline != nullptr); GLenum mode = oreTopologyToGL(m_currentPipeline->desc().topology); + // drawBaseInstance=false on GL, so the Lua guard rejects firstInstance>0. + // Wiring needs glDrawArraysInstancedBaseInstance (ANGLE). (void)firstInstance; if (instanceCount > 1) @@ -524,6 +526,9 @@ const void* offset = reinterpret_cast<const void*>( static_cast<uintptr_t>(firstIndex * indexSize)); + // drawBaseInstance=false on GL, so the Lua guard rejects baseVertex!=0 and + // firstInstance>0. Wiring needs + // glDrawElementsInstancedBaseVertexBaseInstance (ANGLE). (void)baseVertex; (void)firstInstance;
diff --git a/renderer/src/ore/vulkan/ore_context_vulkan.cpp b/renderer/src/ore/vulkan/ore_context_vulkan.cpp index bc1c8a5..7c00f05 100644 --- a/renderer/src/ore/vulkan/ore_context_vulkan.cpp +++ b/renderer/src/ore/vulkan/ore_context_vulkan.cpp
@@ -506,6 +506,152 @@ m_vkPendingInitialTransitions.clear(); } +void ContextVulkan::vkQueuePendingTextureUpload(VkPendingTextureUpload pending) +{ + // No dedupe: uploads target sub-regions (offset, mipLevel, layer), + // so multiple calls to the same texture are all meaningful. + m_vkPendingTextureUploads.push_back(std::move(pending)); +} + +void ContextVulkan::vkFlushPendingTextureUploads() +{ + if (m_vkPendingTextureUploads.empty()) + return; + // Caller contract: only invoked from beginFrame / beginRenderPass, + // both of which run with a host-owned recording CB. + assert(m_vkCmdBufRecording && m_vkCommandBuffer != VK_NULL_HANDLE); + + // Resolve the valid uploads and collect the distinct destination images. + // Barriers batch per image (one to-transfer + one to-shader for the whole + // flush) instead of two per upload, so an N-layer array texture costs 2 + // barriers, not 2N. + struct DistinctTex + { + TextureVulkan* tex; + VkImageLayout oldLayout; + VkImageAspectFlags aspectMask; + }; + std::vector<DistinctTex> distinct; + struct ValidUpload + { + VkBuffer srcBuffer; + VkImage dstImage; + const VkBufferImageCopy* region; + }; + std::vector<ValidUpload> uploads; + uploads.reserve(m_vkPendingTextureUploads.size()); + for (auto& pu : m_vkPendingTextureUploads) + { + auto* vkTex = lite_rtti_cast<TextureVulkan*>(pu.texture.get()); + auto* vkBuf = lite_rtti_cast<BufferVulkan*>(pu.stagingBuffer.get()); + // Texture or staging buffer torn down before flush. Skip silently. + if (vkTex == nullptr || vkBuf == nullptr || + vkTex->m_vkImage == VK_NULL_HANDLE) + continue; + uploads.push_back({vkBuf->m_vkBuffer, vkTex->m_vkImage, &pu.region}); + bool found = false; + for (auto& d : distinct) + { + if (d.tex == vkTex) + { + d.aspectMask |= pu.aspectMask; + found = true; + break; + } + } + if (!found) + distinct.push_back({vkTex, vkTex->m_vkLayout, pu.aspectMask}); + } + if (uploads.empty()) + { + m_vkPendingTextureUploads.clear(); + return; + } + + // 1) Transition every distinct image to TRANSFER_DST in one barrier. The + // src scope comes from each image's tracked layout so the upload waits on + // any prior use (attachment write, shader read) instead of racing it. + std::vector<VkImageMemoryBarrier> toTransfer; + toTransfer.reserve(distinct.size()); + VkPipelineStageFlags srcStageAcc = 0; + for (auto& d : distinct) + { + VkPipelineStageFlags srcStage; + VkAccessFlags srcAccess; + pipelineStageAndAccessForLayout(d.oldLayout, &srcStage, &srcAccess); + srcStageAcc |= srcStage; + VkImageMemoryBarrier b{}; + b.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + b.srcAccessMask = srcAccess; + b.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + b.oldLayout = d.oldLayout; + b.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + b.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + b.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + b.image = d.tex->m_vkImage; + b.subresourceRange.aspectMask = d.aspectMask; + b.subresourceRange.baseMipLevel = 0; + b.subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + b.subresourceRange.baseArrayLayer = 0; + b.subresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS; + toTransfer.push_back(b); + } + m_vk->CmdPipelineBarrier(m_vkCommandBuffer, + srcStageAcc, + VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, + 0, + nullptr, + 0, + nullptr, + static_cast<uint32_t>(toTransfer.size()), + toTransfer.data()); + + // 2) Copy each staged region. Disjoint sub-regions (distinct offset / mip + // / layer) need no inter-copy barrier. + for (auto& u : uploads) + { + m_vk->CmdCopyBufferToImage(m_vkCommandBuffer, + u.srcBuffer, + u.dstImage, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + 1, + u.region); + } + + // 3) Transition every distinct image back to SHADER_READ_ONLY in one + // barrier. + VkPipelineStageFlags dstStage; + VkAccessFlags dstAccess; + pipelineStageAndAccessForLayout(VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + &dstStage, + &dstAccess); + std::vector<VkImageMemoryBarrier> toShader = toTransfer; + for (auto& b : toShader) + { + b.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + b.dstAccessMask = dstAccess; + b.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + b.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + } + m_vk->CmdPipelineBarrier(m_vkCommandBuffer, + VK_PIPELINE_STAGE_TRANSFER_BIT, + dstStage, + 0, + 0, + nullptr, + 0, + nullptr, + static_cast<uint32_t>(toShader.size()), + toShader.data()); + for (auto& d : distinct) + d.tex->m_vkLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // Dropping the rcps releases the staging buffers; GPUResource + // purgatory keeps them alive until safeFrameNumber. + m_vkPendingTextureUploads.clear(); +} + void ContextVulkan::beginFrame(const FrameDescriptor& desc) { assert(desc.externalCommandBuffer != VK_NULL_HANDLE); @@ -519,8 +665,8 @@ static_cast<VkCommandBuffer>(desc.externalCommandBuffer); m_vkCmdBufRecording = true; - // Flush any lazy-init barriers queued before this frame. Host has - // already called BeginCommandBuffer on externalCb, so recording is safe. + // Drain pre-frame deferred work onto the host's CB. + vkFlushPendingTextureUploads(); vkFlushPendingInitialTransitions(); } @@ -1409,10 +1555,9 @@ rpBI.clearValueCount = attachCount; rpBI.pClearValues = clearValues; - // Flush any pending sampled-image transitions (queued by makeBindGroup - // and the loadOp=load attachment branches above) before entering the - // pass — barriers inside a pass are restricted to declared self- - // dependencies, so cross-pass image-layout work must land here. + // Drain barriers and uploads outside the pass; transfers and + // unrestricted barriers are not allowed inside one. + vkFlushPendingTextureUploads(); vkFlushPendingInitialTransitions(); m_vk->CmdBeginRenderPass(m_vkCommandBuffer,
diff --git a/renderer/src/ore/vulkan/ore_texture_vulkan.cpp b/renderer/src/ore/vulkan/ore_texture_vulkan.cpp index 5ecaea6..50ce778 100644 --- a/renderer/src/ore/vulkan/ore_texture_vulkan.cpp +++ b/renderer/src/ore/vulkan/ore_texture_vulkan.cpp
@@ -49,132 +49,147 @@ return VK_IMAGE_ASPECT_COLOR_BIT; } -// Transition an image from one layout to another using a pipeline barrier -// on the given command buffer. -static void transitionLayout(PFN_vkCmdPipelineBarrier pfnCmdPipelineBarrier, - VkCommandBuffer cmdBuf, - VkImage image, - VkImageAspectFlags aspectMask, - VkImageLayout oldLayout, - VkImageLayout newLayout, - uint32_t mipCount = 1, - uint32_t layerCount = 1) -{ - VkImageMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - barrier.subresourceRange.aspectMask = aspectMask; - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = mipCount; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = layerCount; - - VkPipelineStageFlags srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - VkPipelineStageFlags dstStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) - { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - dstStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) - { - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT; - dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) - { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) - { - barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - srcStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - dstStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) - { - barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - srcStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - } - else if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) - { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - srcStage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - dstStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT; - } - else - { - // Generic fallback — conservative but correct. - barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT; - barrier.dstAccessMask = - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT; - srcStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; - dstStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; - } - - pfnCmdPipelineBarrier(cmdBuf, - srcStage, - dstStage, - 0, - 0, - nullptr, - 0, - nullptr, - 1, - &barrier); -} - // ============================================================================ // Texture // ============================================================================ void TextureVulkan::upload(const TextureDataDesc& data) { - // Staging upload records into whichever command buffer the owning - // Context currently has open — either Ore's own CB (owned-CB mode) or - // the host's CB (external-CB mode). Resolved at call time so the same - // Texture can span both modes across its lifetime. + // Stage CPU-side, queue for the next host CB. Callers may run with + // no recording CB (verify hooks, scripted shader setup). assert(m_vkOreContext != nullptr); - // Make sure the owned cb is in the recording state — scripts can call - // upload() outside a host-driven frame window (verify hooks during - // artboard construction, scripted shader effects during PLS draw). - VkCommandBuffer cmdBuf = m_vkOreContext->m_vkCommandBuffer; - auto pfnCmdPipelineBarrier = m_vkOreContext->m_vk->CmdPipelineBarrier; - auto pfnCmdCopyBufferToImage = m_vkOreContext->m_vk->CmdCopyBufferToImage; - assert(cmdBuf != VK_NULL_HANDLE); assert(m_vkImage != VK_NULL_HANDLE); + if (data.data == nullptr) + { + m_vkOreContext->setLastError("upload: data is null"); + return; + } - // Determine byte size of the upload region. - uint32_t bytesPerRow = data.bytesPerRow; - uint32_t height = data.height > 0 ? data.height : m_height; - uint32_t depth = data.depth > 0 ? data.depth : m_depthOrArrayLayers; - VkDeviceSize uploadSize = - static_cast<VkDeviceSize>(bytesPerRow) * height * depth; + const uint32_t bptVK = textureFormatBytesPerTexel(m_format); + // mipLevel must index a declared level. + if (data.mipLevel >= m_numMipmaps) + { + m_vkOreContext->setLastError("upload: mipLevel (%u) >= numMipmaps (%u)", + data.mipLevel, + m_numMipmaps); + return; + } + // layer must index a declared slice (1 for non-array/non-cube). + if (data.layer >= m_depthOrArrayLayers) + { + m_vkOreContext->setLastError( + "upload: layer (%u) >= depthOrArrayLayers (%u)", + data.layer, + m_depthOrArrayLayers); + return; + } + // Mip-adjusted extents (Vulkan-spec floor(max(1, dim >> mipLevel))). + const uint32_t mipWidth = + (m_width >> data.mipLevel) > 0 ? (m_width >> data.mipLevel) : 1u; + const uint32_t mipHeight = + (m_height >> data.mipLevel) > 0 ? (m_height >> data.mipLevel) : 1u; + const uint32_t width = data.width > 0 ? data.width : mipWidth; + const uint32_t height = data.height > 0 ? data.height : mipHeight; + // imageExtent.depth is the copy's z-extent. The array slice is chosen via + // baseArrayLayer with layerCount 1, so only true 3D textures carry depth + // greater than 1. Defaulting to m_depthOrArrayLayers would set depth to the + // array count for array2D/cube and over-read the staging source. + const uint32_t maxDepth = + m_type == TextureType::texture3D ? m_depthOrArrayLayers : 1u; + const uint32_t depth = data.depth > 0 ? data.depth : maxDepth; + // Region must fit within the mip's extent. 64-bit so a large x/y offset + // can't wrap past the guard. + if (static_cast<uint64_t>(data.x) + width > mipWidth || + static_cast<uint64_t>(data.y) + height > mipHeight) + { + m_vkOreContext->setLastError( + "upload: region (x=%u y=%u w=%u h=%u) out of bounds for " + "mip %u (%ux%u)", + data.x, + data.y, + width, + height, + data.mipLevel, + mipWidth, + mipHeight); + return; + } + // z-slice must fit the texture depth (1 for everything but 3D). + if (static_cast<uint64_t>(data.z) + depth > maxDepth) + { + m_vkOreContext->setLastError( + "upload: z-region (z=%u depth=%u) out of bounds (maxDepth=%u)", + data.z, + depth, + maxDepth); + return; + } + // bytesPerTexel == 0 means a block-compressed format. We don't have + // a block-size-aware path for bufferRowLength yet, so reject upfront. + if (bptVK == 0) + { + m_vkOreContext->setLastError( + "upload: block-compressed formats not yet supported"); + return; + } + // Uncompressed: bytesPerRow must be a whole number of texels and + // cover at least width texels so bufferRowLength can encode the + // caller pitch and the GPU read stays inside the staging buffer. + if (data.bytesPerRow != 0 && (data.bytesPerRow % bptVK) != 0) + { + m_vkOreContext->setLastError( + "upload: bytesPerRow (%u) must be a whole number of texels " + "(bytesPerTexel=%u)", + data.bytesPerRow, + bptVK); + return; + } + if (data.bytesPerRow != 0 && + data.bytesPerRow < static_cast<uint64_t>(width) * bptVK) + { + m_vkOreContext->setLastError( + "upload: bytesPerRow (%u) < width * bytesPerTexel (%llu)", + data.bytesPerRow, + static_cast<unsigned long long>(static_cast<uint64_t>(width) * + bptVK)); + return; + } + // rowsPerImage is the per-slice stride; 0 means "use height", otherwise + // it must cover at least height rows. + if (data.rowsPerImage > 0 && data.rowsPerImage < height) + { + m_vkOreContext->setLastError("upload: rowsPerImage (%u) < height (%u)", + data.rowsPerImage, + height); + return; + } + // Derive row/total sizes in 64-bit so the tightly-packed fallback + // (width * bptVK) and the total can't wrap before the uint32_t guard. + // bytesPerRow == 0 means tightly packed. + const uint64_t bytesPerRow64 = data.bytesPerRow != 0 + ? static_cast<uint64_t>(data.bytesPerRow) + : static_cast<uint64_t>(width) * bptVK; + const uint32_t rowsPerImage = + data.rowsPerImage > 0 ? data.rowsPerImage : height; + const VkDeviceSize uploadSize = bytesPerRow64 * + static_cast<uint64_t>(rowsPerImage) * + static_cast<uint64_t>(depth); + // BufferVulkan size is uint32_t; refuse larger uploads rather than + // silently truncating the staging copy. + if (uploadSize > UINT32_MAX) + { + m_vkOreContext->setLastError( + "upload: size (%llu) exceeds uint32_t staging buffer max", + static_cast<unsigned long long>(uploadSize)); + return; + } - // Create a transient host-visible staging buffer. + auto stagingBuffer = + rcp<BufferVulkan>(new BufferVulkan(m_manager, + static_cast<uint32_t>(uploadSize), + BufferUsage::upload)); + stagingBuffer->m_vk = m_vk; + VkBufferCreateInfo bufCI{}; bufCI.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; bufCI.size = uploadSize; @@ -184,50 +199,33 @@ allocCI.usage = VMA_MEMORY_USAGE_CPU_ONLY; allocCI.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - m_stagingBuffer = - rcp<BufferVulkan>(new BufferVulkan(m_manager, - static_cast<uint32_t>(uploadSize), - BufferUsage::upload)); - m_stagingBuffer->m_vk = m_vk; - VmaAllocationInfo allocInfo{}; - vmaCreateBuffer(m_vk->allocator(), - &bufCI, - &allocCI, - &m_stagingBuffer->m_vkBuffer, - &m_stagingBuffer->m_vmaAllocation, - &allocInfo); - m_stagingBuffer->m_vkMappedPtr = allocInfo.pMappedData; + VkResult vmaRes = vmaCreateBuffer(m_vk->allocator(), + &bufCI, + &allocCI, + &stagingBuffer->m_vkBuffer, + &stagingBuffer->m_vmaAllocation, + &allocInfo); + if (vmaRes != VK_SUCCESS || allocInfo.pMappedData == nullptr) + { + m_vkOreContext->setLastError( + "upload: staging buffer allocation failed (size=%llu, vk=%d)", + static_cast<unsigned long long>(uploadSize), + static_cast<int>(vmaRes)); + return; + } + stagingBuffer->m_vkMappedPtr = allocInfo.pMappedData; - m_stagingBuffer->update(data.data, static_cast<uint32_t>(uploadSize), 0); + stagingBuffer->update(data.data, static_cast<uint32_t>(uploadSize), 0); - // Transition to TRANSFER_DST. - transitionLayout(pfnCmdPipelineBarrier, - cmdBuf, - m_vkImage, - aspectMask(m_format), - m_vkLayout, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - m_numMipmaps, - m_depthOrArrayLayers); - m_vkLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - - // Copy staging buffer → image. `bufferRowLength` and - // `bufferImageHeight` are in texels — 0 means "tightly packed at - // imageExtent". When the caller's source has padding (e.g. a - // sub-rect of a larger image, or `bytesPerRow > width * - // bytesPerTexel`), the staging copy must honour that pitch or the - // image comes back with the padding bytes interleaved into every - // other row. Pre-fix this was hardcoded to 0; the - // `ore_array_upload` GM (Phase 4 witness) caught the regression - // on every Android Vulkan target. - const uint32_t bptVK = textureFormatBytesPerTexel(m_format); + // bufferRowLength / bufferImageHeight are in texels; 0 means tightly + // packed at imageExtent. Honour caller pitch when present. Upstream + // guards make the div-by-bptVK safe. The `ore_array_upload` GM locks + // this — without it the GL backend silently strides wrong. VkBufferImageCopy region{}; region.bufferOffset = 0; region.bufferRowLength = - (data.bytesPerRow != 0 && bptVK != 0 && (data.bytesPerRow % bptVK) == 0) - ? data.bytesPerRow / bptVK - : 0; + data.bytesPerRow != 0 ? data.bytesPerRow / bptVK : 0; region.bufferImageHeight = data.rowsPerImage; region.imageSubresource.aspectMask = aspectMask(m_format); region.imageSubresource.mipLevel = data.mipLevel; @@ -236,25 +234,14 @@ region.imageOffset = {static_cast<int32_t>(data.x), static_cast<int32_t>(data.y), static_cast<int32_t>(data.z)}; - region.imageExtent = {data.width > 0 ? data.width : m_width, height, depth}; + region.imageExtent = {width, height, depth}; - pfnCmdCopyBufferToImage(cmdBuf, - m_stagingBuffer->m_vkBuffer, - m_vkImage, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - 1, - ®ion); - - // Transition back to SHADER_READ_ONLY. - transitionLayout(pfnCmdPipelineBarrier, - cmdBuf, - m_vkImage, - aspectMask(m_format), - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, - m_numMipmaps, - m_depthOrArrayLayers); - m_vkLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + m_vkOreContext->vkQueuePendingTextureUpload({ + ref_rcp(this), + std::move(stagingBuffer), + region, + aspectMask(m_format), + }); } TextureVulkan::~TextureVulkan()
diff --git a/renderer/src/ore/vulkan/ore_texture_vulkan.hpp b/renderer/src/ore/vulkan/ore_texture_vulkan.hpp index 88ec78a..6aa349c 100644 --- a/renderer/src/ore/vulkan/ore_texture_vulkan.hpp +++ b/renderer/src/ore/vulkan/ore_texture_vulkan.hpp
@@ -27,7 +27,6 @@ VkImageLayout m_vkLayout = VK_IMAGE_LAYOUT_UNDEFINED; VkDevice m_vkDevice = VK_NULL_HANDLE; // Weak ref. rcp<rive::gpu::VulkanContext> m_vk; - rcp<BufferVulkan> m_stagingBuffer; // Back-ref so upload() can route through ContextVulkan. Weak ref. ContextVulkan* m_vkOreContext = nullptr; };
diff --git a/src/lua/renderer/lua_gpu.cpp b/src/lua/renderer/lua_gpu.cpp index 8c2eae4..9ed8ea7 100644 --- a/src/lua/renderer/lua_gpu.cpp +++ b/src/lua/renderer/lua_gpu.cpp
@@ -113,6 +113,10 @@ return "r16float"; case TextureFormat::rgba32float: return "rgba32float"; + case TextureFormat::rg32float: + return "rg32float"; + case TextureFormat::r32float: + return "r32float"; case TextureFormat::rgb10a2unorm: return "rgb10a2unorm"; case TextureFormat::r11g11b10float: @@ -130,6 +134,34 @@ } } +// Float color render-target capability a format needs, or none if it is not a +// float format. 16-bit floats need colorBufferHalfFloat; 32-bit and packed +// floats need the full colorBufferFloat. +enum class FloatColorClass +{ + none, + half, + full, +}; + +static FloatColorClass floatColorClass(TextureFormat fmt) +{ + switch (fmt) + { + case TextureFormat::rgba16float: + case TextureFormat::rg16float: + case TextureFormat::r16float: + return FloatColorClass::half; + case TextureFormat::rgba32float: + case TextureFormat::rg32float: + case TextureFormat::r32float: + case TextureFormat::r11g11b10float: + return FloatColorClass::full; + default: + return FloatColorClass::none; + } +} + static TextureType lua_totexturetype(lua_State* L, const char* s) { if (strcmp(s, "2d") == 0) @@ -1131,6 +1163,29 @@ lua_getoptionalnumberfield(L, descIdx, "layers", 1)); auto* ctx = getOreContext(L); + // Gate float render targets: without the matching capability they make an + // incomplete FBO that renders black. Sampled-only float textures are fine. + // 16-bit floats need half-float, 32-bit and packed need full float. + if (desc.renderTarget) + { + FloatColorClass fc = floatColorClass(desc.format); + const Features& feat = ctx->features(); + bool ok = fc == FloatColorClass::none || + (fc == FloatColorClass::half && feat.colorBufferHalfFloat) || + (fc == FloatColorClass::full && feat.colorBufferFloat); + if (!ok) + { + const char* cap = fc == FloatColorClass::half + ? "colorBufferHalfFloat" + : "colorBufferFloat"; + luaL_error(L, + "GPUTexture.new: float format %s as a renderTarget " + "requires the %s feature, which the active backend does " + "not support", + lua_totextureformatstring(desc.format), + cap); + } + } ctx->clearLastError(); auto texture = ctx->makeTexture(desc); if (!texture) @@ -2511,6 +2566,14 @@ lua_isnumber(L, 5) ? static_cast<int32_t>(lua_tointeger(L, 5)) : 0; uint32_t firstInstance = lua_isnumber(L, 6) ? static_cast<uint32_t>(lua_tonumber(L, 6)) : 0; + if (baseVertex != 0 && !getOreContext(L)->features().drawBaseInstance) + { + luaL_error(L, + "drawIndexed: baseVertex=%d requires the " + "drawBaseInstance feature, which the active backend " + "does not support", + baseVertex); + } if (firstInstance > 0 && !getOreContext(L)->features().drawBaseInstance) { luaL_error(L,
diff --git a/tests/gm/ore_array_upload.cpp b/tests/gm/ore_array_upload.cpp index e189cf8..80b7eb4 100644 --- a/tests/gm/ore_array_upload.cpp +++ b/tests/gm/ore_array_upload.cpp
@@ -24,6 +24,11 @@ * times in stripes (or just garbage) instead of the intended * solid-color square. * + * 3. Partial-region uploads. Layer 0 is written as four 2×2 sub-rect + * tiles, so it samples identically but every backend's sub-region + * path runs. Witnesses the D3D12 full-subresource over-read (pre-fix + * it read the whole mip out of each small tile buffer). + * * Layer colors (mip 0): * layer 0 → red (255, 0, 0) * layer 1 → green ( 0, 255, 0) @@ -101,6 +106,29 @@ } } } + +// 2×2 RGBA8 tile, non-tightly-packed (8 valid + 8 padding) so partial-region +// uploads still exercise the pitch plumbing. +static constexpr uint32_t kTileWidth = 2; +static constexpr uint32_t kTileHeight = 2; +static constexpr uint32_t kTilePaddedRowBytes = 16; // 2 * 4 valid + 8 padding. +static constexpr uint32_t kTilePaddedBytes = kTilePaddedRowBytes * kTileHeight; + +static void fillSolidTile(uint8_t* dst, uint8_t r, uint8_t g, uint8_t b) +{ + memset(dst, 0xCC, kTilePaddedBytes); // padding sentinel. + for (uint32_t y = 0; y < kTileHeight; ++y) + { + uint8_t* row = dst + y * kTilePaddedRowBytes; + for (uint32_t x = 0; x < kTileWidth; ++x) + { + row[x * 4 + 0] = r; + row[x * 4 + 1] = g; + row[x * 4 + 2] = b; + row[x * 4 + 3] = 0xFF; + } + } +} #endif class OreArrayUploadGM : public GM @@ -211,12 +239,13 @@ sampBGDesc.samplerCount = 1; auto sampBG = ctx.makeBindGroup(sampBGDesc); - m_ore.beginFrame(renderContext); - - // Per-layer uploads. Each upload targets `(mipLevel=0, layer=N)` - // with a non-tightly-packed source (bytesPerRow = 32, double - // the tight row of 16) so the GL backend's GL_UNPACK_ROW_LENGTH - // plumbing is exercised. + // Per-layer uploads issued BEFORE m_ore.beginFrame() so the + // Vulkan backend's deferred-upload path is exercised (regression + // for the post-#12661 fix: upload outside an Ore frame must + // queue and drain at the next beginFrame / beginRenderPass). + // Each upload targets `(mipLevel=0, layer=N)` with a non-tightly- + // packed source (bytesPerRow = 32, double the tight row of 16) + // so the GL backend's GL_UNPACK_ROW_LENGTH plumbing is exercised. uint8_t layerData[kPaddedLayerBytes]; struct LayerColor { @@ -230,6 +259,39 @@ }; for (uint32_t layer = 0; layer < 4; ++layer) { + // Layer 0 as four 2×2 sub-rect tiles. Same solid-red result, but + // exercises every backend's partial-region path. Pre-fix D3D12 + // over-read the whole mip out of each tile buffer. + if (layer == 0) + { + const uint32_t kTileOffsets[4][2] = { + {0, 0}, + {kTileWidth, 0}, + {0, kTileHeight}, + {kTileWidth, kTileHeight}, + }; + uint8_t tileData[kTilePaddedBytes]; + for (const auto& off : kTileOffsets) + { + fillSolidTile(tileData, + kColors[0].r, + kColors[0].g, + kColors[0].b); + TextureDataDesc upload{}; + upload.data = tileData; + upload.bytesPerRow = kTilePaddedRowBytes; + upload.rowsPerImage = kTileHeight; + upload.mipLevel = 0; + upload.layer = 0; + upload.x = off[0]; + upload.y = off[1]; + upload.width = kTileWidth; + upload.height = kTileHeight; + upload.depth = 1; + arrTex->upload(upload); + } + continue; + } fillSolidLayer(layerData, kColors[layer].r, kColors[layer].g, @@ -246,6 +308,8 @@ arrTex->upload(upload); } + m_ore.beginFrame(renderContext); + ColorAttachment ca{}; ca.view = canvasView.get(); ca.loadOp = LoadOp::clear;