.
diff --git a/sparse_strips/vello_hybrid/src/render/webgl.rs b/sparse_strips/vello_hybrid/src/render/webgl.rs
index 217a2f9..6d42b3e 100644
--- a/sparse_strips/vello_hybrid/src/render/webgl.rs
+++ b/sparse_strips/vello_hybrid/src/render/webgl.rs
@@ -1285,9 +1285,15 @@
         gl.clear(WebGl2RenderingContext::COLOR_BUFFER_BIT);
     }
 
-    /// Upload strip data to GPU.
-    fn upload_strips(&mut self, gl: &WebGl2RenderingContext, strips: &[GpuStrip]) {
-        if strips.is_empty() {
+    /// Upload two strip slices (opaque, then alpha) back to back into one GPU buffer,
+    /// using a single `bufferData` allocation followed by `bufferSubData` writes.
+    fn upload_strip_pair(
+        &mut self,
+        gl: &WebGl2RenderingContext,
+        first: &[GpuStrip],
+        second: &[GpuStrip],
+    ) {
+        if first.is_empty() && second.is_empty() {
             return;
         }
 
@@ -1295,12 +1301,32 @@
             WebGl2RenderingContext::ARRAY_BUFFER,
             Some(&self.resources.strips_buffer),
         );
-        let strips_data = bytemuck::cast_slice(strips);
-        gl.buffer_data_with_u8_array(
+
+        let first_bytes: &[u8] = bytemuck::cast_slice(first);
+        let second_bytes: &[u8] = bytemuck::cast_slice(second);
+        let total_len = first_bytes.len() + second_bytes.len();
+
+        // Allocate buffer, then write both slices via bufferSubData to avoid
+        // a temporary concatenation Vec.
+        gl.buffer_data_with_i32(
             WebGl2RenderingContext::ARRAY_BUFFER,
-            strips_data,
+            total_len as i32,
             WebGl2RenderingContext::DYNAMIC_DRAW,
         );
+        if !first_bytes.is_empty() {
+            gl.buffer_sub_data_with_i32_and_u8_array(
+                WebGl2RenderingContext::ARRAY_BUFFER,
+                0,
+                first_bytes,
+            );
+        }
+        if !second_bytes.is_empty() {
+            gl.buffer_sub_data_with_i32_and_u8_array(
+                WebGl2RenderingContext::ARRAY_BUFFER,
+                first_bytes.len() as i32,
+                second_bytes,
+            );
+        }
     }
 }
 
@@ -1919,6 +1945,11 @@
     framebuffer
 }
 
+const STRIP_STRIDE: i32 = size_of::<GpuStrip>() as i32;
+const STRIP_ATTR_COUNT: i32 = STRIP_STRIDE / 4;
+const _: () = assert!(STRIP_STRIDE == 24, "expected stride of 24");
+const _: () = assert!(STRIP_ATTR_COUNT == 6);
+
 /// Initialize strip VAO.
 fn initialize_strip_vao(gl: &WebGl2RenderingContext, resources: &WebGlResources) {
     gl.bind_vertex_array(Some(&resources.strip_vao));
@@ -1927,12 +1958,7 @@
         Some(&resources.strips_buffer),
     );
 
-    const STRIDE: i32 = size_of::<GpuStrip>() as i32;
-    const { assert!(STRIDE == 24, "expected stride of 24") };
-    let stride = STRIDE;
-
-    // Configure attributes.
-    for i in 0..6 {
+    for i in 0..STRIP_ATTR_COUNT {
         let location = i as u32;
         let offset = i * 4;
 
@@ -1941,7 +1967,7 @@
             location,
             1,
             WebGl2RenderingContext::UNSIGNED_INT,
-            stride,
+            STRIP_STRIDE,
             offset,
         );
 
@@ -2180,6 +2206,13 @@
         let is_final_view =
             matches!(target, StripPassRenderTarget::Output(OutputTarget::FinalView));
 
+        // Single upload for all strip data (opaque, then alpha): repeated `bufferData`
+        // calls are more expensive than rebinding the attribute pointers.
+        self.programs
+            .upload_strip_pair(self.gl, opaque_strips, alpha_strips);
+        let opaque_count = opaque_strips.len() as i32;
+        let alpha_count = alpha_strips.len() as i32;
+
         if is_final_view {
             // Clear depth buffer on first use per frame.
             if !self.programs.resources.depth_cleared_this_frame {
@@ -2193,29 +2226,51 @@
             self.gl.depth_func(WebGl2RenderingContext::LEQUAL);
 
             // Opaque pass: front-to-back, depth write ON, blend OFF.
-            if !opaque_strips.is_empty() {
-                self.programs.upload_strips(self.gl, opaque_strips);
+            // Instances 0..opaque_count are already at offset 0 in the buffer.
+            if opaque_count > 0 {
                 self.gl.depth_mask(true);
                 self.gl.disable(WebGl2RenderingContext::BLEND);
                 self.gl.draw_arrays_instanced(
                     WebGl2RenderingContext::TRIANGLE_STRIP,
                     0,
                     4,
-                    opaque_strips.len() as i32,
+                    opaque_count,
                 );
             }
 
             // Alpha pass: back-to-front, depth test ON, depth write OFF, blend ON.
-            if !alpha_strips.is_empty() {
-                self.programs.upload_strips(self.gl, alpha_strips);
+            // Rebind attribute pointers with offset to start at the alpha portion.
+            if alpha_count > 0 {
+                let alpha_byte_offset = opaque_count * STRIP_STRIDE;
+                for i in 0..STRIP_ATTR_COUNT {
+                    self.gl.vertex_attrib_i_pointer_with_i32(
+                        i as u32,
+                        1,
+                        WebGl2RenderingContext::UNSIGNED_INT,
+                        STRIP_STRIDE,
+                        i as i32 * 4 + alpha_byte_offset,
+                    );
+                }
+
                 self.gl.depth_mask(false);
                 self.gl.enable(WebGl2RenderingContext::BLEND);
                 self.gl.draw_arrays_instanced(
                     WebGl2RenderingContext::TRIANGLE_STRIP,
                     0,
                     4,
-                    alpha_strips.len() as i32,
+                    alpha_count,
                 );
+
+                // Restore attribute offsets to base for subsequent passes.
+                for i in 0..STRIP_ATTR_COUNT {
+                    self.gl.vertex_attrib_i_pointer_with_i32(
+                        i as u32,
+                        1,
+                        WebGl2RenderingContext::UNSIGNED_INT,
+                        STRIP_STRIDE,
+                        i as i32 * 4,
+                    );
+                }
             }
 
             // Restore state.
@@ -2224,18 +2279,11 @@
             self.gl.enable(WebGl2RenderingContext::BLEND);
         } else {
             // Slot texture / intermediate: single draw with blending, no depth.
-            // Combine both lists for upload.
-            let all_strips: Vec<GpuStrip> = opaque_strips
-                .iter()
-                .chain(alpha_strips.iter())
-                .copied()
-                .collect();
-            self.programs.upload_strips(self.gl, &all_strips);
             self.gl.draw_arrays_instanced(
                 WebGl2RenderingContext::TRIANGLE_STRIP,
                 0,
                 4,
-                all_strips.len() as i32,
+                opaque_count + alpha_count,
             );
         }
 
diff --git a/sparse_strips/vello_hybrid/src/render/wgpu.rs b/sparse_strips/vello_hybrid/src/render/wgpu.rs
index c7d686f..2523fc4 100644
--- a/sparse_strips/vello_hybrid/src/render/wgpu.rs
+++ b/sparse_strips/vello_hybrid/src/render/wgpu.rs
@@ -2312,19 +2312,28 @@
         }
     }
 
-    /// Upload the strip data by creating and assigning a new `self.resources.strips_buffer`.
-    fn upload_strips(&mut self, device: &Device, queue: &Queue, strips: &[GpuStrip]) {
-        let required_strips_size = size_of_val(strips) as u64;
-        self.resources.strips_buffer = Self::create_strips_buffer(device, required_strips_size);
-        // TODO: Consider using a staging belt to avoid an extra staging buffer allocation.
+    /// Upload two strip slices (opaque, then alpha) into a newly created
+    /// `self.resources.strips_buffer`, avoiding an intermediate `Vec` allocation.
+    fn upload_strip_pair(
+        &mut self,
+        device: &Device,
+        queue: &Queue,
+        first: &[GpuStrip],
+        second: &[GpuStrip],
+    ) {
+        let first_bytes = size_of_val(first) as u64;
+        let second_bytes = size_of_val(second) as u64;
+        let total = first_bytes + second_bytes;
+        self.resources.strips_buffer = Self::create_strips_buffer(device, total);
         let mut buffer = queue
             .write_buffer_with(
                 &self.resources.strips_buffer,
                 0,
-                required_strips_size.try_into().unwrap(),
+                total.try_into().unwrap(),
             )
             .expect("Capacity handled in creation");
-        buffer.copy_from_slice(bytemuck::cast_slice(strips));
+        buffer[..first_bytes as usize].copy_from_slice(bytemuck::cast_slice(first));
+        buffer[first_bytes as usize..].copy_from_slice(bytemuck::cast_slice(second));
     }
 }
 
@@ -2353,11 +2362,8 @@
         if opaque_strips.is_empty() && alpha_strips.is_empty() {
             return;
         }
-        // Upload all strips (opaque first, then alpha) into a single buffer.
-        let total_strips: Vec<GpuStrip> =
-            opaque_strips.iter().chain(alpha_strips.iter()).copied().collect();
         self.programs
-            .upload_strips(self.device, self.queue, &total_strips);
+            .upload_strip_pair(self.device, self.queue, opaque_strips, alpha_strips);
         let opaque_count = u32::try_from(opaque_strips.len()).unwrap();
         let alpha_count = u32::try_from(alpha_strips.len()).unwrap();
 
diff --git a/sparse_strips/vello_hybrid/src/schedule.rs b/sparse_strips/vello_hybrid/src/schedule.rs
index a7805c1..614e945 100644
--- a/sparse_strips/vello_hybrid/src/schedule.rs
+++ b/sparse_strips/vello_hybrid/src/schedule.rs
@@ -1007,8 +1007,11 @@
             let layer_index = self.next_layer_index();
             let draw = self.draw_mut(self.round, 2);
             draw.push_opaque(
-                GpuStripBuilder::at_surface(wide_tile_x, wide_tile_y, WideTile::WIDTH)
-                    .paint(payload, paint, layer_index),
+                GpuStripBuilder::at_surface(wide_tile_x, wide_tile_y, WideTile::WIDTH).paint(
+                    payload,
+                    paint,
+                    layer_index,
+                ),
             );
         }
     }
@@ -1413,13 +1416,11 @@
             // `BlendState::PREMULTIPLIED_ALPHA_BLENDING`). This is the whole reason
             // why for default blend modes, we don't need to rely on temporary slots
             // to achieve blending.
-            draw.push_alpha(
-                gpu_strip_builder.copy_from_slot(
-                    tos.dest_slot.get_idx(),
-                    (tos.opacity * 255.0) as u8,
-                    layer_index,
-                ),
-            );
+            draw.push_alpha(gpu_strip_builder.copy_from_slot(
+                tos.dest_slot.get_idx(),
+                (tos.opacity * 255.0) as u8,
+                layer_index,
+            ));
         }
     }
 
@@ -1465,11 +1466,11 @@
         };
 
         let draw = self.draw_mut(el_round, draw_texture);
-        draw.push_alpha(
-            gpu_strip_builder
-                .with_sparse(cmd.width, col_idx)
-                .paint(payload, paint, layer_index),
-        );
+        draw.push_alpha(gpu_strip_builder.with_sparse(cmd.width, col_idx).paint(
+            payload,
+            paint,
+            layer_index,
+        ));
     }
 
     #[inline]
@@ -1553,6 +1554,7 @@
                             && img.sampler.alpha == 1.0
                             && img.tint.is_none_or(|t| t.color.components[3] >= 1.0)
                     }
+                    Some(EncodedPaint::Gradient(g)) => !g.may_have_opacities,
                     _ => false,
                 }
             }
@@ -1616,9 +1618,11 @@
         } else {
             GpuStripBuilder::at_slot(nos.dest_slot.get_idx(), cmd.x, cmd.width)
         };
-        draw.push_alpha(
-            gpu_strip_builder.copy_from_slot(tos.dest_slot.get_idx(), 0xFF, layer_index),
-        );
+        draw.push_alpha(gpu_strip_builder.copy_from_slot(
+            tos.dest_slot.get_idx(),
+            0xFF,
+            layer_index,
+        ));
 
         let nos_ptr = state.tile_state.stack.len() - 2;
         state.tile_state.stack[nos_ptr].temporary_slot.invalidate();
@@ -2014,9 +2018,7 @@
     // Edge strips: 1px wide/tall strips carrying fractional coverage.
     // Top edge (full width, 1px tall).
     if has_top {
-        let frac = u32::from(frac_l)
-            | (u32::from(frac_t) << 8)
-            | (u32::from(frac_r) << 16);
+        let frac = u32::from(frac_l) | (u32::from(frac_t) << 8) | (u32::from(frac_r) << 16);
         let (payload, paint_packed) =
             Scheduler::process_paint(&rect.paint, encoded_paints, (base_x, base_y), paint_idxs);
         draw.push_alpha(GpuStrip {
@@ -2034,15 +2036,9 @@
     // Bottom edge (full width, 1px tall).
     if has_bottom {
         let bottom_y = base_y + snapped_h - 1;
-        let frac = u32::from(frac_l)
-            | (u32::from(frac_r) << 16)
-            | (u32::from(frac_b) << 24);
-        let (payload, paint_packed) = Scheduler::process_paint(
-            &rect.paint,
-            encoded_paints,
-            (base_x, bottom_y),
-            paint_idxs,
-        );
+        let frac = u32::from(frac_l) | (u32::from(frac_r) << 16) | (u32::from(frac_b) << 24);
+        let (payload, paint_packed) =
+            Scheduler::process_paint(&rect.paint, encoded_paints, (base_x, bottom_y), paint_idxs);
         draw.push_alpha(GpuStrip {
             x: base_x,
             y: bottom_y,