Implement basic clip logic Adds a clip method to the (CPU) render context, plus a considerable amount of mechanism in coarse and fine rasterization to support clipping. The coarse rasterization logic contains a set of optimizations similar to those in Vello. In particular, all-zero tiles have drawing suppressed, and all-one tiles pass drawing commands through with no additional work to clip. Not extensively validated, but it does render a simple scene with clipping correctly.
diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs index eeb6a1b..433823f 100644 --- a/vello_hybrid/examples/gpu.rs +++ b/vello_hybrid/examples/gpu.rs
@@ -61,7 +61,7 @@ surface.configure(&device, &sc); let session = GpuSession::new(&device, format); - // TODO: actually render something + let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize); draw_simple_scene(&mut render_ctx); let bufs = render_ctx.harvest();
diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs index e91f017..bae2c10 100644 --- a/vello_hybrid/examples/simple.rs +++ b/vello_hybrid/examples/simple.rs
@@ -7,24 +7,16 @@ use std::io::BufWriter; use vello_api::peniko::color::palette; -use vello_api::peniko::kurbo::{BezPath, Stroke}; +use vello_api::peniko::kurbo::{BezPath, Point, Stroke, Vec2}; use vello_api::RenderCtx; use vello_hybrid::{CsRenderCtx, Pixmap}; const WIDTH: usize = 1024; -const HEIGHT: usize = 256; +const HEIGHT: usize = 1024; pub fn main() { let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT); - let mut path = BezPath::new(); - path.move_to((10.0, 10.0)); - path.line_to((180.0, 20.0)); - path.line_to((30.0, 40.0)); - path.close_path(); - let piet_path = path.into(); - ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); - let stroke = Stroke::new(5.0); - ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); + draw_simple_scene(&mut ctx); if let Some(filename) = std::env::args().nth(1) { let mut pixmap = Pixmap::new(WIDTH, HEIGHT); ctx.render_to_pixmap(&mut pixmap); @@ -39,3 +31,30 @@ ctx.debug_dump(); } } + +fn star(center: Point, n: usize, inner: f64, outer: f64) -> BezPath { + let mut path = BezPath::new(); + path.move_to(center + Vec2::new(outer, 0.)); + for i in 1..n * 2 { + let th = i as f64 * std::f64::consts::PI / n as f64; + let r = if i % 2 == 0 { outer } else { inner }; + path.line_to(center + r * Vec2::from_angle(th)); + } + path.close_path(); + path +} + +fn draw_simple_scene(ctx: &mut CsRenderCtx) { + let mut path = BezPath::new(); + path.move_to((10.0, 10.0)); + path.line_to((180.0, 20.0)); + path.line_to((30.0, 180.0)); + path.close_path(); + // Note: we plan to change the API to have `into`. + let piet_path = path.into(); + let stroke = Stroke::new(5.0); + ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); + let star_path = star(Point::new(100., 100.), 13, 50., 95.); + ctx.clip(&star_path.into()); + ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); +}
diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs index 4b93552..52b94b0 100644 --- a/vello_hybrid/src/fine.rs +++ b/vello_hybrid/src/fine.rs
@@ -15,7 +15,7 @@ // f32 RGBA pixels // That said, if we use u8, then this is basically a block of // untyped memory. - pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4], + pub(crate) scratch: Vec<[f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]>, #[allow(clippy::doc_markdown, reason = "false positive for x86_64")] /// Whether to use SIMD /// @@ -31,7 +31,7 @@ impl<'a> Fine<'a> { pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self { - let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]; + let scratch = vec![[0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]]; Self { width, height, @@ -42,7 +42,8 @@ } pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) { - for z in self.scratch.chunks_exact_mut(4) { + let scratch = self.scratch.last_mut().unwrap(); + for z in scratch.chunks_exact_mut(4) { z.copy_from_slice(&color); } } @@ -58,12 +59,13 @@ (y + 1) * STRIP_HEIGHT <= self.height, "overflow of pixmap height" ); + let scratch = self.scratch.last_mut().unwrap(); let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; for j in 0..STRIP_HEIGHT { let line_ix = base_ix + j * self.width * 4; for i in 0..WIDE_TILE_WIDTH { let mut rgba_f32 = [0.0; 4]; - rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]); + rgba_f32.copy_from_slice(&scratch[(i * STRIP_HEIGHT + j) * 4..][..4]); let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8); self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8); } @@ -79,20 +81,28 @@ let aslice = &alphas[s.alpha_ix..]; self.strip(s.x as usize, s.width as usize, aslice, s.color.components); } + Cmd::PushClip => self.scratch.push([0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]), + Cmd::PopClip => _ = self.scratch.pop(), + Cmd::ClipFill(f) => { + self.clip_fill_scalar(f.x as usize, f.width as usize); + } + Cmd::ClipStrip(s) => { + let aslice = &alphas[s.alpha_ix..]; + self.clip_strip_scalar(s.x as usize, s.width as usize, aslice); + } } } pub(crate) fn fill_scalar(&mut self, x: usize, 
width: usize, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); if color[3] == 1.0 { - for z in - self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) { z.copy_from_slice(&color); } } else { let one_minus_alpha = 1.0 - color[3]; - for z in - self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) { for i in 0..4 { //z[i] = color[i] + one_minus_alpha * z[i]; @@ -106,9 +116,10 @@ } pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); let cs = color.map(|z| z * (1.0 / 255.0)); - for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width] + for (z, a) in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width] .chunks_exact_mut(16) .zip(alphas) { @@ -121,4 +132,33 @@ } } } + + fn clip_fill_scalar(&mut self, x: usize, width: usize) { + let (tos, rest) = self.scratch.split_last_mut().unwrap(); + let nos = rest.last_mut().unwrap(); + for i in 0..width { + for j in 0..4 { + let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4; + let one_minus_alpha = 1.0 - tos[ix + 3]; + for k in 0..4 { + nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, tos[ix + k]); + } + } + } + } + + fn clip_strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32]) { + let (tos, rest) = self.scratch.split_last_mut().unwrap(); + let nos = rest.last_mut().unwrap(); + for (i, a) in alphas.iter().take(width).enumerate() { + for j in 0..4 { + let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4; + let mask_alpha = ((a >> (j * 8)) & 0xff) as f32 * (1. 
/ 255.); + let one_minus_alpha = 1.0 - mask_alpha * tos[ix + 3]; + for k in 0..4 { + nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, mask_alpha * tos[ix + k]); + } + } + } + } }
diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs index b4ed688..c5a1b65 100644 --- a/vello_hybrid/src/gpu.rs +++ b/vello_hybrid/src/gpu.rs
@@ -139,7 +139,8 @@ } } - pub fn harvest(&self) -> GpuRenderBufs { + pub fn harvest(&mut self) -> GpuRenderBufs { + self.inner.finish(); let mut strips = Vec::new(); let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH); let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT); @@ -184,6 +185,7 @@ }; strips.push(strip); } + _ => todo!(), } } }
diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs index 1c7bfa6..7ecb8b2 100644 --- a/vello_hybrid/src/render.rs +++ b/vello_hybrid/src/render.rs
@@ -19,7 +19,7 @@ fine::Fine, strip::{self, Strip, Tile}, tiling::{self, FlatLine}, - wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH}, + wide_tile::{Cmd, CmdClipStrip, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH}, Pixmap, }; @@ -34,6 +34,22 @@ line_buf: Vec<FlatLine>, tile_buf: Vec<Tile>, strip_buf: Vec<Strip>, + + state_stack: Vec<GfxState>, + clip_stack: Vec<Clip>, +} + +struct GfxState { + // TODO: transform goes here (there's logic in piet-ts to copy) + n_clip: usize, +} + +struct Clip { + // should probably be a bounding box type + /// The intersected bounding box after clip + clip_bbox: [usize; 4], + /// The rendered path in sparse strip representation + strips: Vec<Strip>, } pub struct CsResourceCtx; @@ -45,18 +61,17 @@ let tiles = (0..width_tiles * height_tiles) .map(|_| WideTile::default()) .collect(); - let alphas = vec![]; - let line_buf = vec![]; - let tile_buf = vec![]; - let strip_buf = vec![]; + let state = GfxState { n_clip: 0 }; Self { width, height, tiles, - alphas, - line_buf, - tile_buf, - strip_buf, + alphas: vec![], + line_buf: vec![], + tile_buf: vec![], + strip_buf: vec![], + state_stack: vec![state], + clip_stack: vec![], } } @@ -67,7 +82,16 @@ } } - pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) { + /// Finish the coarse rasterization prior to fine rendering. + /// + /// At the moment, this mostly involves resolving any open clips, but + /// might extend to other things. + pub(crate) fn finish(&mut self) { + self.pop_clips(); + } + + pub fn render_to_pixmap(&mut self, pixmap: &mut Pixmap) { + self.finish(); let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf); let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); let height_tiles = (self.height).div_ceil(STRIP_HEIGHT); @@ -94,27 +118,43 @@ println!("total = {total}, {histo:?}"); } + /// Render a path to the strip buffer. 
+ fn render_path_common(&mut self) { + tiling::make_tiles(&self.line_buf, &mut self.tile_buf); + self.tile_buf.sort_unstable_by(Tile::cmp); + crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas); + } + /// Render a path, which has already been flattened into `line_buf`. fn render_path(&mut self, brush: BrushRef<'_>) { // TODO: need to make sure tiles contained in viewport - we'll likely // panic otherwise. - tiling::make_tiles(&self.line_buf, &mut self.tile_buf); - self.tile_buf.sort_unstable_by(Tile::cmp); - crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas); + self.render_path_common(); let color = brush_to_color(brush); let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH); + let bbox = self.get_bbox(); for i in 0..self.strip_buf.len() - 1 { let strip = &self.strip_buf[i]; let next_strip = &self.strip_buf[i + 1]; let x0 = strip.x(); - let y = strip.strip_y(); - let row_start = y as usize * width_tiles; + let y = strip.strip_y() as usize; + if y < bbox[1] { + continue; + } + if y >= bbox[3] { + break; + } + let row_start = y * width_tiles; let strip_width = next_strip.col - strip.col; let x1 = x0 + strip_width; - let xtile0 = x0 as usize / WIDE_TILE_WIDTH; - let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH); + let xtile0 = (x0 as usize / WIDE_TILE_WIDTH).max(bbox[0]); + let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]); let mut x = x0; let mut col = strip.col; + if (bbox[0] * WIDE_TILE_WIDTH) as u32 > x { + col += (bbox[0] * WIDE_TILE_WIDTH) as u32 - x; + x = (bbox[0] * WIDE_TILE_WIDTH) as u32; + } for xtile in xtile0..xtile1 { let x_tile_rel = x % WIDE_TILE_WIDTH as u32; let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; @@ -126,13 +166,13 @@ }; x += width; col += width; - self.tiles[row_start + xtile].push(Cmd::Strip(cmd)); + self.tiles[row_start + xtile].strip(cmd); } - if next_strip.winding != 0 && y == next_strip.strip_y() { + if next_strip.winding != 0 && y == 
next_strip.strip_y() as usize { x = x1; let x2 = next_strip.x(); - let fxt0 = x1 as usize / WIDE_TILE_WIDTH; - let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH); + let fxt0 = (x1 as usize / WIDE_TILE_WIDTH).max(bbox[0]); + let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]); for xtile in fxt0..fxt1 { let x_tile_rel = x % WIDE_TILE_WIDTH as u32; let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; @@ -161,6 +201,133 @@ // TODO: get from graphics state Affine::scale(5.0) } + + fn get_bbox(&self) -> [usize; 4] { + if let Some(tos) = self.clip_stack.last() { + tos.clip_bbox + } else { + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + let height_tiles = (self.height).div_ceil(STRIP_HEIGHT); + [0, 0, width_tiles, height_tiles] + } + } + + fn pop_clip(&mut self) { + self.state_stack.last_mut().unwrap().n_clip -= 1; + let Clip { clip_bbox, strips } = self.clip_stack.pop().unwrap(); + let n_strips = strips.len(); + // The next bit of code accomplishes the following. For each tile in + // the intersected bounding box, it does one of three things depending + // on the contents of the clip path in that tile. + // If all-zero: pop a zero_clip. + // If all-one: do nothing. + // If contains one or more strips: render strips and fills, then pop a clip. + // This logic is the inverse of the push logic in `clip()`, and the stack + // should be balanced after running both. 
+ let mut tile_x = clip_bbox[0]; + let mut tile_y = clip_bbox[1]; + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + let mut pop_pending = false; + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let y = strip.strip_y() as usize; + if y < tile_y { + continue; + } + while tile_y < y.min(clip_bbox[3]) { + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + if tile_y == clip_bbox[3] { + break; + } + let x0 = strip.x() as usize; + let x_clamped = (x0 / WIDE_TILE_WIDTH).min(clip_bbox[2]); + if tile_x < x_clamped { + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + // The winding check is probably not needed; if there was a fill, + // the logic below should have advanced tile_x. + if strip.winding == 0 { + for x in tile_x..x_clamped { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + } + tile_x = x_clamped; + } + let next_strip = &strips[i + 1]; + let strip_width = (next_strip.col - strip.col) as usize; + let x1 = x0 + strip_width; + let xtile0 = (x0 / WIDE_TILE_WIDTH).max(clip_bbox[0]); + let xtile1 = x1.div_ceil(WIDE_TILE_WIDTH).min(clip_bbox[2]); + let mut x = x0; + let mut alpha_ix = strip.col as usize; + if clip_bbox[0] * WIDE_TILE_WIDTH > x { + alpha_ix += clip_bbox[0] * WIDE_TILE_WIDTH - x; + x = clip_bbox[0] * WIDE_TILE_WIDTH; + } + for xtile in xtile0..xtile1 { + if xtile > tile_x && core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + } + let x_tile_rel = (x % WIDE_TILE_WIDTH) as u32; + let width = x1.min((xtile + 1) * WIDE_TILE_WIDTH) - x; + let cmd = CmdClipStrip { + x: x_tile_rel, + width: width as u32, + alpha_ix, + }; + x += width; + alpha_ix += width; + self.tiles[tile_y * width_tiles + xtile].clip_strip(cmd); + tile_x = 
xtile; + pop_pending = true; + } + if next_strip.winding != 0 && y == next_strip.strip_y() as usize { + let x2 = next_strip.x() as usize; + let tile_x2 = x2.min((tile_x + 1) * WIDE_TILE_WIDTH); + let width = tile_x2 - x1; + if width > 0 { + let x_tile_rel = (x1 % WIDE_TILE_WIDTH) as u32; + self.tiles[tile_y * width_tiles + tile_x].clip_fill(x_tile_rel, width as u32); + } + if x2 > (tile_x + 1) * WIDE_TILE_WIDTH { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + let width2 = x2 % WIDE_TILE_WIDTH; + tile_x = x2 / WIDE_TILE_WIDTH; + if width2 > 0 { + self.tiles[tile_y * width_tiles + tile_x].clip_fill(0, width2 as u32); + } + } + } + } + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + while tile_y < clip_bbox[3] { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + } + + fn pop_clips(&mut self) { + while self.state_stack.last().unwrap().n_clip > 0 { + self.pop_clip(); + } + } } impl RenderCtx for CsRenderCtx { @@ -200,15 +367,104 @@ } fn clip(&mut self, path: &vello_api::Path) { - todo!() + let affine = self.get_affine(); + crate::flatten::fill(&path.path, affine, &mut self.line_buf); + self.render_path_common(); + let strips = core::mem::take(&mut self.strip_buf); + let n_strips = strips.len(); + let path_bbox = if n_strips <= 1 { + [0, 0, 0, 0] + } else { + let y0 = strips[0].strip_y() as usize; + let y1 = strips[n_strips - 1].strip_y() as usize + 1; + let mut x0 = strips[0].x() as usize / WIDE_TILE_WIDTH; + let mut x1 = x0; + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let next_strip = &strips[i + 1]; + let width = next_strip.col - strip.col; + let x = strip.x() as usize; + x0 = x0.min(x / WIDE_TILE_WIDTH); + x1 = x1.max((x + width as usize).div_ceil(WIDE_TILE_WIDTH)); + } + [x0, x1, y0, y1] + }; + let parent_bbox = self.get_bbox(); + // intersect clip bounding box + let clip_bbox = [ + 
parent_bbox[0].max(path_bbox[0]), + parent_bbox[1].max(path_bbox[1]), + parent_bbox[2].min(path_bbox[2]), + parent_bbox[3].min(path_bbox[3]), + ]; + // The next bit of code accomplishes the following. For each tile in + // the intersected bounding box, it does one of three things depending + // on the contents of the clip path in that tile. + // If all-zero: push a zero_clip + // If all-one: do nothing + // If contains one or more strips: push a clip + let mut tile_x = clip_bbox[0]; + let mut tile_y = clip_bbox[1]; + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let y = strip.strip_y() as usize; + if y < tile_y { + continue; + } + while tile_y < y.min(clip_bbox[3]) { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + if tile_y == clip_bbox[3] { + break; + } + let x_pixels = strip.x() as usize; + let x_clamped = (x_pixels / WIDE_TILE_WIDTH).min(clip_bbox[2]); + if tile_x < x_clamped { + if strip.winding == 0 { + for x in tile_x..x_clamped { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + } + // If winding is nonzero, then wide tiles covered entirely + // by sparse fill are no-op (no clipping is applied). 
+ tile_x = x_clamped; + } + let next_strip = &strips[i + 1]; + let width = (next_strip.col - strip.col) as usize; + let x1 = (x_pixels + width) + .div_ceil(WIDE_TILE_WIDTH) + .min(clip_bbox[2]); + if tile_x < x1 { + for x in tile_x..x1 { + self.tiles[tile_y * width_tiles + x].push_clip(); + } + tile_x = x1; + } + } + while tile_y < clip_bbox[3] { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + let clip = Clip { clip_bbox, strips }; + self.clip_stack.push(clip); + self.state_stack.last_mut().unwrap().n_clip += 1; } fn save(&mut self) { - todo!() + self.state_stack.push(GfxState { n_clip: 0 }); } fn restore(&mut self) { - todo!() + self.pop_clips(); + self.state_stack.pop(); } fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) {
diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs index 72bf920..777b809 100644 --- a/vello_hybrid/src/simd/neon.rs +++ b/vello_hybrid/src/simd/neon.rs
@@ -14,16 +14,18 @@ impl Fine<'_> { pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { let v_color = vld1q_f32(color.as_ptr()); let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); for i in 0..WIDE_TILE_WIDTH { - vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4); } } } pub(crate) fn pack_simd(&mut self, x: usize, y: usize) { + let scratch = self.scratch.last_mut().unwrap(); unsafe fn cvt(v: float32x4_t) -> uint8x16_t { unsafe { let clamped = vminq_f32(v, vdupq_n_f32(1.0)); @@ -40,14 +42,14 @@ let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; for i in (0..WIDE_TILE_WIDTH).step_by(4) { let chunk_ix = base_ix + i * 4; - let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16)); - let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16)); + let v0 = vld1q_f32_x4(scratch.as_ptr().add(i * 16)); + let v1 = vld1q_f32_x4(scratch.as_ptr().add((i + 1) * 16)); let x0 = cvt2(v0.0, v1.0); let x1 = cvt2(v0.1, v1.1); let x2 = cvt2(v0.2, v1.2); let x3 = cvt2(v0.3, v1.3); - let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16)); - let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16)); + let v2 = vld1q_f32_x4(scratch.as_ptr().add((i + 2) * 16)); + let v3 = vld1q_f32_x4(scratch.as_ptr().add((i + 3) * 16)); let x4 = cvt2(v2.0, v3.0); let y0 = vuzp1q_u8(x0, x4); vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0); @@ -68,24 +70,25 @@ } pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { let v_color = vld1q_f32(color.as_ptr()); let alpha = color[3]; if alpha == 1.0 { let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); for i in x..x + width { - vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4); } } else { let 
one_minus_alpha = vdupq_n_f32(1.0 - alpha); for i in x..x + width { let ix = (x + i) * 16; - let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix)); v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha); v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha); v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha); v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha); - vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v); } } } @@ -99,6 +102,7 @@ alphas: &[u32], color: [f32; 4], ) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0)); @@ -111,13 +115,13 @@ let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0))); let alpha = vcvtq_f32_u32(a4); let ix = (x + i) * 16; - let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix)); let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3); v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0); v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1); v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2); v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3); - vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v); } } }
diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs index 5e71153..9d8dbcd 100644 --- a/vello_hybrid/src/wide_tile.rs +++ b/vello_hybrid/src/wide_tile.rs
@@ -6,15 +6,24 @@ pub(crate) const WIDE_TILE_WIDTH: usize = 256; pub(crate) const STRIP_HEIGHT: usize = 4; +#[derive(Debug)] pub(crate) struct WideTile { pub(crate) bg: AlphaColor<Srgb>, pub(crate) cmds: Vec<Cmd>, + n_zero_clip: usize, + n_clip: usize, } #[derive(Debug)] pub(crate) enum Cmd { Fill(CmdFill), Strip(CmdStrip), + /// Pushes a new transparent buffer to the clip stack. + PushClip, + /// Pops the clip stack. + PopClip, + ClipFill(CmdClipFill), + ClipStrip(CmdClipStrip), } #[derive(Debug)] @@ -33,26 +42,111 @@ pub(crate) color: AlphaColor<Srgb>, } +/// Same as fill, but copies top of clip stack to next on stack. +#[derive(Debug)] +pub(crate) struct CmdClipFill { + pub(crate) x: u32, + pub(crate) width: u32, + // TODO: this should probably get at least an alpha for group opacity + // Also, this is where blend modes go. +} + +/// Same as strip, but composites top of clip stack to next on stack. +#[derive(Debug)] +pub(crate) struct CmdClipStrip { + pub(crate) x: u32, + pub(crate) width: u32, + pub(crate) alpha_ix: usize, + // See `CmdClipFill` for blending extension points +} + impl Default for WideTile { fn default() -> Self { Self { bg: AlphaColor::TRANSPARENT, cmds: vec![], + n_zero_clip: 0, + n_clip: 0, } } } impl WideTile { pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor<Srgb>) { - if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 { - self.cmds.clear(); - self.bg = color; - } else { - self.cmds.push(Cmd::Fill(CmdFill { x, width, color })); + if !self.is_zero_clip() { + // Note that we could be more aggressive in optimizing a whole-tile opaque fill + // even with a clip stack. It would be valid to elide all drawing commands from + // the enclosing clip push up to the fill. Further, we could extend the clip + // push command to include a background color, rather than always starting with + // a transparent buffer. 
Lastly, a sequence of push(bg); strip/fill; pop could + // be replaced with strip/fill with the color (the latter is true even with a + // non-opaque color). + // + // However, the extra cost of tracking such optimizations may outweigh the + // benefit, especially in hybrid mode with GPU painting. + if x == 0 + && width == WIDE_TILE_WIDTH as u32 + && color.components[3] == 1.0 + && self.n_clip == 0 + { + self.cmds.clear(); + self.bg = color; + } else { + self.cmds.push(Cmd::Fill(CmdFill { x, width, color })); + } + } + } + + pub(crate) fn strip(&mut self, cmd_strip: CmdStrip) { + if !self.is_zero_clip() { + self.cmds.push(Cmd::Strip(cmd_strip)); } } pub(crate) fn push(&mut self, cmd: Cmd) { self.cmds.push(cmd); } + + pub(crate) fn push_clip(&mut self) { + if !self.is_zero_clip() { + self.push(Cmd::PushClip); + self.n_clip += 1; + } + } + + pub(crate) fn pop_clip(&mut self) { + if !self.is_zero_clip() { + if matches!(self.cmds.last(), Some(Cmd::PushClip)) { + // Nothing was drawn inside the clip, elide it. + self.cmds.pop(); + } else { + self.push(Cmd::PopClip); + } + self.n_clip -= 1; + } + } + + pub(crate) fn push_zero_clip(&mut self) { + self.n_zero_clip += 1; + } + + pub(crate) fn pop_zero_clip(&mut self) { + self.n_zero_clip -= 1; + } + + pub(crate) fn is_zero_clip(&mut self) -> bool { + self.n_zero_clip > 0 + } + + pub(crate) fn clip_strip(&mut self, cmd_clip_strip: CmdClipStrip) { + if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) { + self.cmds.push(Cmd::ClipStrip(cmd_clip_strip)); + } + } + + pub(crate) fn clip_fill(&mut self, x: u32, width: u32) { + if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) { + self.cmds.push(Cmd::ClipFill(CmdClipFill { x, width })); + } + } }