.
diff --git a/sparse_strips/vello_hybrid/src/render/webgl.rs b/sparse_strips/vello_hybrid/src/render/webgl.rs index 217a2f9..6d42b3e 100644 --- a/sparse_strips/vello_hybrid/src/render/webgl.rs +++ b/sparse_strips/vello_hybrid/src/render/webgl.rs
@@ -1285,9 +1285,15 @@ gl.clear(WebGl2RenderingContext::COLOR_BUFFER_BIT); } - /// Upload strip data to GPU. - fn upload_strips(&mut self, gl: &WebGl2RenderingContext, strips: &[GpuStrip]) { - if strips.is_empty() { + /// Upload two strip slices (opaque then alpha) into a single GPU buffer, + /// avoiding multiple `bufferData` calls. + fn upload_strip_pair( + &mut self, + gl: &WebGl2RenderingContext, + first: &[GpuStrip], + second: &[GpuStrip], + ) { + if first.is_empty() && second.is_empty() { return; } @@ -1295,12 +1301,32 @@ WebGl2RenderingContext::ARRAY_BUFFER, Some(&self.resources.strips_buffer), ); - let strips_data = bytemuck::cast_slice(strips); - gl.buffer_data_with_u8_array( + + let first_bytes: &[u8] = bytemuck::cast_slice(first); + let second_bytes: &[u8] = bytemuck::cast_slice(second); + let total_len = first_bytes.len() + second_bytes.len(); + + // Allocate buffer, then write both slices via bufferSubData to avoid + // a temporary concatenation Vec. + gl.buffer_data_with_i32( WebGl2RenderingContext::ARRAY_BUFFER, - strips_data, + total_len as i32, WebGl2RenderingContext::DYNAMIC_DRAW, ); + if !first_bytes.is_empty() { + gl.buffer_sub_data_with_i32_and_u8_array( + WebGl2RenderingContext::ARRAY_BUFFER, + 0, + first_bytes, + ); + } + if !second_bytes.is_empty() { + gl.buffer_sub_data_with_i32_and_u8_array( + WebGl2RenderingContext::ARRAY_BUFFER, + first_bytes.len() as i32, + second_bytes, + ); + } } } @@ -1919,6 +1945,11 @@ framebuffer } +const STRIP_STRIDE: i32 = size_of::<GpuStrip>() as i32; +const STRIP_ATTR_COUNT: i32 = STRIP_STRIDE / 4; +const _: () = assert!(STRIP_STRIDE == 24, "expected stride of 24"); +const _: () = assert!(STRIP_ATTR_COUNT == 6); + /// Initialize strip VAO. fn initialize_strip_vao(gl: &WebGl2RenderingContext, resources: &WebGlResources) { gl.bind_vertex_array(Some(&resources.strip_vao)); @@ -1927,12 +1958,7 @@ Some(&resources.strips_buffer), ); - const STRIDE: i32 = size_of::<GpuStrip>() as i32; - const { assert!(STRIDE == 24, "expected stride of 24") }; - let stride = STRIDE; - - // Configure attributes. - for i in 0..6 { + for i in 0..STRIP_ATTR_COUNT { let location = i as u32; let offset = i * 4; @@ -1941,7 +1967,7 @@ location, 1, WebGl2RenderingContext::UNSIGNED_INT, - stride, + STRIP_STRIDE, offset, ); @@ -2180,6 +2206,13 @@ let is_final_view = matches!(target, StripPassRenderTarget::Output(OutputTarget::FinalView)); + // Single upload for all strip data (opaque then alpha) to avoid + // multiple bufferData calls which are more expensive than attribute rebinding. + self.programs + .upload_strip_pair(self.gl, opaque_strips, alpha_strips); + let opaque_count = opaque_strips.len() as i32; + let alpha_count = alpha_strips.len() as i32; + if is_final_view { // Clear depth buffer on first use per frame. if !self.programs.resources.depth_cleared_this_frame { @@ -2193,29 +2226,51 @@ self.gl.depth_func(WebGl2RenderingContext::LEQUAL); // Opaque pass: front-to-back, depth write ON, blend OFF. - if !opaque_strips.is_empty() { - self.programs.upload_strips(self.gl, opaque_strips); + // Instances 0..opaque_count are already at offset 0 in the buffer. + if opaque_count > 0 { self.gl.depth_mask(true); self.gl.disable(WebGl2RenderingContext::BLEND); self.gl.draw_arrays_instanced( WebGl2RenderingContext::TRIANGLE_STRIP, 0, 4, - opaque_strips.len() as i32, + opaque_count, ); } // Alpha pass: back-to-front, depth test ON, depth write OFF, blend ON. - if !alpha_strips.is_empty() { - self.programs.upload_strips(self.gl, alpha_strips); + // Rebind attribute pointers with offset to start at the alpha portion. + if alpha_count > 0 { + let alpha_byte_offset = opaque_count * STRIP_STRIDE; + for i in 0..STRIP_ATTR_COUNT { + self.gl.vertex_attrib_i_pointer_with_i32( + i as u32, + 1, + WebGl2RenderingContext::UNSIGNED_INT, + STRIP_STRIDE, + i as i32 * 4 + alpha_byte_offset, + ); + } + self.gl.depth_mask(false); self.gl.enable(WebGl2RenderingContext::BLEND); self.gl.draw_arrays_instanced( WebGl2RenderingContext::TRIANGLE_STRIP, 0, 4, - alpha_strips.len() as i32, + alpha_count, ); + + // Restore attribute offsets to base for subsequent passes. + for i in 0..STRIP_ATTR_COUNT { + self.gl.vertex_attrib_i_pointer_with_i32( + i as u32, + 1, + WebGl2RenderingContext::UNSIGNED_INT, + STRIP_STRIDE, + i as i32 * 4, + ); + } } // Restore state. @@ -2224,18 +2279,11 @@ self.gl.enable(WebGl2RenderingContext::BLEND); } else { // Slot texture / intermediate: single draw with blending, no depth. - // Combine both lists for upload. - let all_strips: Vec<GpuStrip> = opaque_strips - .iter() - .chain(alpha_strips.iter()) - .copied() - .collect(); - self.programs.upload_strips(self.gl, &all_strips); self.gl.draw_arrays_instanced( WebGl2RenderingContext::TRIANGLE_STRIP, 0, 4, - all_strips.len() as i32, + opaque_count + alpha_count, ); }
diff --git a/sparse_strips/vello_hybrid/src/render/wgpu.rs b/sparse_strips/vello_hybrid/src/render/wgpu.rs index c7d686f..2523fc4 100644 --- a/sparse_strips/vello_hybrid/src/render/wgpu.rs +++ b/sparse_strips/vello_hybrid/src/render/wgpu.rs
@@ -2312,19 +2312,28 @@ } } - /// Upload the strip data by creating and assigning a new `self.resources.strips_buffer`. - fn upload_strips(&mut self, device: &Device, queue: &Queue, strips: &[GpuStrip]) { - let required_strips_size = size_of_val(strips) as u64; - self.resources.strips_buffer = Self::create_strips_buffer(device, required_strips_size); - // TODO: Consider using a staging belt to avoid an extra staging buffer allocation. + /// Upload two strip slices (opaque then alpha) into a single GPU buffer, + /// avoiding an intermediate Vec allocation. + fn upload_strip_pair( + &mut self, + device: &Device, + queue: &Queue, + first: &[GpuStrip], + second: &[GpuStrip], + ) { + let first_bytes = size_of_val(first) as u64; + let second_bytes = size_of_val(second) as u64; + let total = first_bytes + second_bytes; + self.resources.strips_buffer = Self::create_strips_buffer(device, total); let mut buffer = queue .write_buffer_with( &self.resources.strips_buffer, 0, - required_strips_size.try_into().unwrap(), + total.try_into().unwrap(), ) .expect("Capacity handled in creation"); - buffer.copy_from_slice(bytemuck::cast_slice(strips)); + buffer[..first_bytes as usize].copy_from_slice(bytemuck::cast_slice(first)); + buffer[first_bytes as usize..].copy_from_slice(bytemuck::cast_slice(second)); } } @@ -2353,11 +2362,8 @@ if opaque_strips.is_empty() && alpha_strips.is_empty() { return; } - // Upload all strips (opaque first, then alpha) into a single buffer. - let total_strips: Vec<GpuStrip> = - opaque_strips.iter().chain(alpha_strips.iter()).copied().collect(); self.programs - .upload_strips(self.device, self.queue, &total_strips); + .upload_strip_pair(self.device, self.queue, opaque_strips, alpha_strips); let opaque_count = u32::try_from(opaque_strips.len()).unwrap(); let alpha_count = u32::try_from(alpha_strips.len()).unwrap();
diff --git a/sparse_strips/vello_hybrid/src/schedule.rs b/sparse_strips/vello_hybrid/src/schedule.rs index a7805c1..614e945 100644 --- a/sparse_strips/vello_hybrid/src/schedule.rs +++ b/sparse_strips/vello_hybrid/src/schedule.rs
@@ -1007,8 +1007,11 @@ let layer_index = self.next_layer_index(); let draw = self.draw_mut(self.round, 2); draw.push_opaque( - GpuStripBuilder::at_surface(wide_tile_x, wide_tile_y, WideTile::WIDTH) - .paint(payload, paint, layer_index), + GpuStripBuilder::at_surface(wide_tile_x, wide_tile_y, WideTile::WIDTH).paint( + payload, + paint, + layer_index, + ), ); } } @@ -1413,13 +1416,11 @@ // `BlendState::PREMULTIPLIED_ALPHA_BLENDING`). This is the whole reason // why for default blend modes, we don't need to rely on temporary slots // to achieve blending. - draw.push_alpha( - gpu_strip_builder.copy_from_slot( - tos.dest_slot.get_idx(), - (tos.opacity * 255.0) as u8, - layer_index, - ), - ); + draw.push_alpha(gpu_strip_builder.copy_from_slot( + tos.dest_slot.get_idx(), + (tos.opacity * 255.0) as u8, + layer_index, + )); } } @@ -1465,11 +1466,11 @@ }; let draw = self.draw_mut(el_round, draw_texture); - draw.push_alpha( - gpu_strip_builder - .with_sparse(cmd.width, col_idx) - .paint(payload, paint, layer_index), - ); + draw.push_alpha(gpu_strip_builder.with_sparse(cmd.width, col_idx).paint( + payload, + paint, + layer_index, + )); } #[inline] @@ -1553,6 +1554,7 @@ && img.sampler.alpha == 1.0 && img.tint.is_none_or(|t| t.color.components[3] >= 1.0) } + Some(EncodedPaint::Gradient(g)) => !g.may_have_opacities, _ => false, } } @@ -1616,9 +1618,11 @@ } else { GpuStripBuilder::at_slot(nos.dest_slot.get_idx(), cmd.x, cmd.width) }; - draw.push_alpha( - gpu_strip_builder.copy_from_slot(tos.dest_slot.get_idx(), 0xFF, layer_index), - ); + draw.push_alpha(gpu_strip_builder.copy_from_slot( + tos.dest_slot.get_idx(), + 0xFF, + layer_index, + )); let nos_ptr = state.tile_state.stack.len() - 2; state.tile_state.stack[nos_ptr].temporary_slot.invalidate(); @@ -2014,9 +2018,7 @@ // Edge strips: 1px wide/tall strips carrying fractional coverage. // Top edge (full width, 1px tall). if has_top { - let frac = u32::from(frac_l) - | (u32::from(frac_t) << 8) - | (u32::from(frac_r) << 16); + let frac = u32::from(frac_l) | (u32::from(frac_t) << 8) | (u32::from(frac_r) << 16); let (payload, paint_packed) = Scheduler::process_paint(&rect.paint, encoded_paints, (base_x, base_y), paint_idxs); draw.push_alpha(GpuStrip { @@ -2034,15 +2036,9 @@ // Bottom edge (full width, 1px tall). if has_bottom { let bottom_y = base_y + snapped_h - 1; - let frac = u32::from(frac_l) - | (u32::from(frac_r) << 16) - | (u32::from(frac_b) << 24); - let (payload, paint_packed) = Scheduler::process_paint( - &rect.paint, - encoded_paints, - (base_x, bottom_y), - paint_idxs, - ); + let frac = u32::from(frac_l) | (u32::from(frac_r) << 16) | (u32::from(frac_b) << 24); + let (payload, paint_packed) = + Scheduler::process_paint(&rect.paint, encoded_paints, (base_x, bottom_y), paint_idxs); draw.push_alpha(GpuStrip { x: base_x, y: bottom_y,