// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Fine rasterizer. This can run in simple (just path rendering) and full
// modes, controllable by #define.
// This is a cut'n'paste of the Tile struct, with backdrop included.
struct Tile {
backdrop: i32,
segments: u32,
}
#import segment
#import config
@group(0) @binding(0)
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> tiles: array<Tile>;
@group(0) @binding(2)
var<storage> segments: array<Segment>;
#ifdef full
#import blend
#import ptcl
let GRADIENT_WIDTH = 512;
@group(0) @binding(3)
var output: texture_storage_2d<rgba8unorm, write>;
@group(0) @binding(4)
var<storage> ptcl: array<u32>;
@group(0) @binding(5)
var gradients: texture_2d<f32>;
@group(0) @binding(6)
var<storage> info: array<u32>;
@group(0) @binding(7)
var image_atlas: texture_2d<f32>;
#ifdef msaa8
let MASK_WIDTH = 32u;
let MASK_HEIGHT = 32u;
// This might be better as a uniform, but uniform buffers have 16-byte alignment requirements
@group(0) @binding(8)
var<storage> mask_lut: array<u32, 256u>;
#endif
#ifdef msaa16
let MASK_WIDTH = 64u;
let MASK_HEIGHT = 64u;
// This might be better as a uniform, but uniform buffers have 16-byte alignment requirements
@group(0) @binding(8)
var<storage> mask_lut: array<u32, 2048u>;
#endif
fn read_fill(cmd_ix: u32) -> CmdFill {
let size_and_rule = ptcl[cmd_ix + 1u];
let seg_data = ptcl[cmd_ix + 2u];
let backdrop = i32(ptcl[cmd_ix + 3u]);
return CmdFill(size_and_rule, seg_data, backdrop);
}
fn read_color(cmd_ix: u32) -> CmdColor {
let rgba_color = ptcl[cmd_ix + 1u];
return CmdColor(rgba_color);
}
fn read_lin_grad(cmd_ix: u32) -> CmdLinGrad {
let index = ptcl[cmd_ix + 1u];
let info_offset = ptcl[cmd_ix + 2u];
let line_x = bitcast<f32>(info[info_offset]);
let line_y = bitcast<f32>(info[info_offset + 1u]);
let line_c = bitcast<f32>(info[info_offset + 2u]);
return CmdLinGrad(index, line_x, line_y, line_c);
}
fn read_rad_grad(cmd_ix: u32) -> CmdRadGrad {
let index = ptcl[cmd_ix + 1u];
let info_offset = ptcl[cmd_ix + 2u];
let m0 = bitcast<f32>(info[info_offset]);
let m1 = bitcast<f32>(info[info_offset + 1u]);
let m2 = bitcast<f32>(info[info_offset + 2u]);
let m3 = bitcast<f32>(info[info_offset + 3u]);
let matrx = vec4(m0, m1, m2, m3);
let xlat = vec2(bitcast<f32>(info[info_offset + 4u]), bitcast<f32>(info[info_offset + 5u]));
let c1 = vec2(bitcast<f32>(info[info_offset + 6u]), bitcast<f32>(info[info_offset + 7u]));
let ra = bitcast<f32>(info[info_offset + 8u]);
let roff = bitcast<f32>(info[info_offset + 9u]);
return CmdRadGrad(index, matrx, xlat, c1, ra, roff);
}
fn read_image(cmd_ix: u32) -> CmdImage {
let info_offset = ptcl[cmd_ix + 1u];
let m0 = bitcast<f32>(info[info_offset]);
let m1 = bitcast<f32>(info[info_offset + 1u]);
let m2 = bitcast<f32>(info[info_offset + 2u]);
let m3 = bitcast<f32>(info[info_offset + 3u]);
let matrx = vec4(m0, m1, m2, m3);
let xlat = vec2(bitcast<f32>(info[info_offset + 4u]), bitcast<f32>(info[info_offset + 5u]));
let xy = info[info_offset + 6u];
let width_height = info[info_offset + 7u];
// The following are not intended to be bitcasts
let x = f32(xy >> 16u);
let y = f32(xy & 0xffffu);
let width = f32(width_height >> 16u);
let height = f32(width_height & 0xffffu);
return CmdImage(matrx, xlat, vec2(x, y), vec2(width, height));
}
fn read_end_clip(cmd_ix: u32) -> CmdEndClip {
let blend = ptcl[cmd_ix + 1u];
let alpha = bitcast<f32>(ptcl[cmd_ix + 2u]);
return CmdEndClip(blend, alpha);
}
#else
@group(0) @binding(3)
var output: texture_storage_2d<r8unorm, write>;
#endif
let PIXELS_PER_THREAD = 4u;
fn fill_path(seg_data: u32, n_segs: u32, backdrop: i32, xy: vec2<f32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
var area: array<f32, PIXELS_PER_THREAD>;
let backdrop_f = f32(backdrop);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
area[i] = backdrop_f;
}
for (var i = 0u; i < n_segs; i++) {
let seg_off = seg_data + i;
let segment = segments[seg_off];
let y = segment.origin.y - xy.y;
let y0 = clamp(y, 0.0, 1.0);
let y1 = clamp(y + segment.delta.y, 0.0, 1.0);
let dy = y0 - y1;
if dy != 0.0 {
let vec_y_recip = 1.0 / segment.delta.y;
let t0 = (y0 - y) * vec_y_recip;
let t1 = (y1 - y) * vec_y_recip;
let startx = segment.origin.x - xy.x;
let x0 = startx + t0 * segment.delta.x;
let x1 = startx + t1 * segment.delta.x;
let xmin0 = min(x0, x1);
let xmax0 = max(x0, x1);
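// For each of this thread's pixels, a is 1 when the clipped span lies entirely to
// the left of the pixel, 0 when entirely to the right, and ramps (piecewise
// quadratically) in between; a * dy is the segment's signed winding/area
// contribution to that pixel.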
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let i_f = f32(i);
let xmin = min(xmin0 - i_f, 1.0) - 1.0e-6;
let xmax = xmax0 - i_f;
let b = min(xmax, 1.0);
let c = max(b, 0.0);
let d = max(xmin, 0.0);
let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[i] += a * dy;
}
}
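// Apply the contribution of a crossing of the tile's left edge (if any): rows at
// or below the crossing get a full +/-1 winding adjustment (sign from the segment's
// x direction); the crossing row gets a partial one.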
let y_edge = sign(segment.delta.x) * clamp(xy.y - segment.y_edge + 1.0, 0.0, 1.0);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
area[i] += y_edge;
}
}
if even_odd {
// even-odd winding rule
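// Fold the winding number onto a period-2 triangle wave so even windings map to 0
// and odd windings to 1, preserving fractional coverage (e.g. 2.3 -> 0.3, 1.2 -> 0.8).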
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let a = area[i];
area[i] = abs(a - 2.0 * round(0.5 * a));
}
} else {
// non-zero winding rule
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
area[i] = min(abs(area[i]), 1.0);
}
}
return area;
}
let WG_SIZE = 64u;
var<workgroup> sh_count: array<u32, WG_SIZE>;
#ifdef msaa8
let SH_SAMPLES_SIZE = 256u;
let SAMPLE_WORDS_PER_PIXEL = 1u;
#endif
#ifdef msaa16
let SH_SAMPLES_SIZE = 512u;
let SAMPLE_WORDS_PER_PIXEL = 2u;
#endif
// Winding numbers for 8 pixels packed into one u32, 4 bits each
var<workgroup> sh_winding: array<atomic<u32>, 32u>;
// Same packing, one group of 8 per pixel
var<workgroup> sh_samples: array<atomic<u32>, SH_SAMPLES_SIZE>;
// Same packing, accumulating winding numbers for vertical edge crossings
var<workgroup> sh_winding_y: array<atomic<u32>, 2u>;
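// All of the above use a bias of 8 per 4-bit field (hence the 0x88888888 initial
// value below) so that winding deltas of either sign fit in an unsigned nibble.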
// number of integer cells spanned by interval defined by a, b
fn span(a: f32, b: f32) -> u32 {
return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
}
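// e.g. span(0.3, 2.1) == 3u (cells 0, 1 and 2); a degenerate interval still counts as 1.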
let SEG_SIZE = 5u;
// New multisampled algorithm.
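// Rough outline: shared winding state is initialized to the 0x8 bias, then segments
// are processed in batches of WG_SIZE. Each thread counts the pixels its segment
// touches, a workgroup prefix sum turns the counts into offsets, and the workgroup
// then walks all (segment, pixel) crossings, accumulating per-pixel winding deltas
// (sh_winding, sh_winding_y) and per-sample deltas from the mask LUT (sh_samples).
// Finally the packed windings are prefix-summed across the tile and each pixel's
// coverage is the fraction of its samples whose resolved winding differs from the
// "outside" value.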
fn fill_path_ms(seg_data: u32, n_segs: u32, backdrop: i32, wg_id: vec2<u32>, local_id: vec2<u32>, even_odd: bool) -> array<f32, PIXELS_PER_THREAD> {
let tile_origin = vec2(f32(wg_id.x) * f32(TILE_WIDTH), f32(wg_id.y) * f32(TILE_HEIGHT));
let th_ix = local_id.y * (TILE_WIDTH / PIXELS_PER_THREAD) + local_id.x;
if th_ix < 32u {
if th_ix < 2u {
atomicStore(&sh_winding_y[th_ix], 0x88888888u);
}
atomicStore(&sh_winding[th_ix], 0x88888888u);
}
let sample_count = PIXELS_PER_THREAD * SAMPLE_WORDS_PER_PIXEL;
for (var i = 0u; i < sample_count; i++) {
atomicStore(&sh_samples[th_ix * sample_count + i], 0x88888888u);
}
workgroupBarrier();
let n_batch = (n_segs + (WG_SIZE - 1u)) / WG_SIZE;
for (var batch = 0u; batch < n_batch; batch++) {
let seg_ix = batch * WG_SIZE + th_ix;
let seg_off = seg_data + seg_ix;
var count = 0u;
let slice_size = min(n_segs - batch * WG_SIZE, WG_SIZE);
// TODO: might save a register rewriting this in terms of limit
if th_ix < slice_size {
let segment = segments[seg_off];
// Note: making coords relative to the tile origin would probably be a good idea
// in the coarse path, especially as f16 would then suffice. But the existing
// scheme is kept for compatibility.
let xy0 = segment.origin - tile_origin;
let xy1 = xy0 + segment.delta;
var y_edge_f = f32(TILE_HEIGHT);
var delta = select(-1, 1, xy1.x <= xy0.x);
if xy0.x == 0.0 && xy1.x == 0.0 {
if xy0.y == 0.0 {
y_edge_f = 0.0;
} else if xy1.y == 0.0 {
y_edge_f = 0.0;
delta = -delta;
}
} else {
if xy0.x == 0.0 {
if xy0.y != 0.0 {
y_edge_f = xy0.y;
}
} else if xy1.x == 0.0 && xy1.y != 0.0 {
y_edge_f = xy1.y;
}
// discard horizontal lines aligned to pixel grid
if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
}
}
let y_edge = u32(ceil(y_edge_f));
if y_edge < TILE_HEIGHT {
atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
}
}
// workgroup prefix sum of counts
sh_count[th_ix] = count;
let lg_n = firstLeadingBit(slice_size * 2u - 1u);
for (var i = 0u; i < lg_n; i++) {
workgroupBarrier();
if th_ix >= 1u << i {
count += sh_count[th_ix - (1u << i)];
}
workgroupBarrier();
sh_count[th_ix] = count;
}
#ifdef have_uniform
let total = workgroupUniformLoad(&sh_count[slice_size - 1u]);
#else
workgroupBarrier();
let total = sh_count[slice_size - 1u];
#endif
for (var i = th_ix; i < total; i += WG_SIZE) {
// binary search to find pixel
var lo = 0u;
var hi = slice_size;
let goal = i;
while hi > lo + 1u {
let mid = (lo + hi) >> 1u;
if goal >= sh_count[mid - 1u] {
lo = mid;
} else {
hi = mid;
}
}
let el_ix = lo;
let last_pixel = i + 1u == sh_count[el_ix];
let sub_ix = i - select(0u, sh_count[el_ix - 1u], el_ix > 0u);
let seg_off = seg_data + batch * WG_SIZE + el_ix;
let segment = segments[seg_off];
let xy0_in = segment.origin - tile_origin;
let xy1_in = xy0_in + segment.delta;
let is_down = xy1_in.y >= xy0_in.y;
let xy0 = select(xy1_in, xy0_in, is_down);
let xy1 = select(xy0_in, xy1_in, is_down);
// Set up data for line rasterization
// Note: this is duplicated work if total count exceeds a workgroup.
// One alternative is to compute it in a separate dispatch.
let dx = abs(xy1.x - xy0.x);
let dy = xy1.y - xy0.y;
let dy_dxdy = dy / (dx + dy);
let a = dx / (dx + dy);
let is_positive_slope = xy1.x >= xy0.x;
let sign = select(-1.0, 1.0, is_positive_slope);
let xt0 = floor(xy0.x * sign);
let c = xy0.x * sign - xt0;
// The JS prototype code has a special case for this; here we should simply not render such segments
let y0i = floor(xy0.y);
let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y);
let b = dy_dxdy * c + a * (ytop - xy0.y);
let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
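// The segment is walked one pixel at a time, each step advancing either one pixel
// in x or one in y; a is (roughly) the fraction of steps that are x steps, so after
// sub_ix steps, floor(a * sub_ix + b) of them have been horizontal.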
// Use line equation to plot pixel coordinates
let zf = a * f32(sub_ix) + b;
let z = floor(zf);
let x = x0i + i32(sign * z);
let y = i32(y0i) + i32(sub_ix) - i32(z);
var is_delta: bool;
// We need to adjust winding number if slope is positive and there
// is a crossing at the left edge of the pixel.
var is_bump = false;
let zp = floor(a * f32(sub_ix - 1u) + b);
if sub_ix == 0u {
is_delta = y0i == xy0.y && y0i != xy1.y;
is_bump = xy0.x == 0.0;
} else {
is_delta = z == zp;
is_bump = is_positive_slope && !is_delta;
}
let pix_ix = u32(y) * TILE_WIDTH + u32(x);
if u32(x) < TILE_WIDTH - 1u && u32(y) < TILE_HEIGHT {
let delta_pix = pix_ix + 1u;
if is_delta {
let delta = select(u32(-1), 1u, is_down) << ((delta_pix & 7u) << 2u);
atomicAdd(&sh_winding[delta_pix >> 3u], delta);
}
}
// Apply sample mask
let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
let half_height = f32(MASK_HEIGHT / 2u);
let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
let mask_col = floor((zf - z) * f32(MASK_WIDTH));
let mask_ix = mask_block + u32(mask_row + mask_col);
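// Each LUT entry is (in effect) a bitmask of the MSAA samples lying on one side of
// a line with the given quantized slope (mask_row) and intra-pixel crossing offset
// (mask_col); the two halves of the table (mask_block) cover the two slope signs.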
#ifdef msaa8
var mask = mask_lut[mask_ix / 4u] >> ((mask_ix % 4u) * 8u);
mask &= 0xffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(8.0 * (xy0.y - f32(y))));
mask &= 0xffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(8.0 * (xy1.y - f32(y))));
mask &= ~(0xffu << mask_shift);
}
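// Expand the 8-bit sample mask so that bit k lands in the low bit of nibble k,
// giving one 4-bit winding slot per sample; the select below then applies a signed
// +/-1 per covered sample according to the segment direction.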
let mask_a = mask | (mask << 6u);
let mask_b = mask_a | (mask_a << 12u);
let mask_exp = (mask_b & 0x1010101u) | ((mask_b << 3u) & 0x10101010u);
var mask_signed = select(mask_exp, u32(-i32(mask_exp)), is_down);
if is_bump {
mask_signed += select(u32(-0x11111111), 0x11111111u, is_down);
}
atomicAdd(&sh_samples[pix_ix], mask_signed);
#endif
#ifdef msaa16
var mask = mask_lut[mask_ix / 2u] >> ((mask_ix % 2u) * 16u);
mask &= 0xffffu;
// Intersect with y half-plane masks
if sub_ix == 0u && !is_bump {
let mask_shift = u32(round(16.0 * (xy0.y - f32(y))));
mask &= 0xffffu << mask_shift;
}
if last_pixel && xy1.x != 0.0 {
let mask_shift = u32(round(16.0 * (xy1.y - f32(y))));
mask &= ~(0xffffu << mask_shift);
}
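// Same nibble expansion as the msaa8 path, applied separately to the low and high
// 8 samples of the 16-bit mask.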
let mask0 = mask & 0xffu;
let mask0_a = mask0 | (mask0 << 6u);
let mask0_b = mask0_a | (mask0_a << 12u);
let mask0_exp = (mask0_b & 0x1010101u) | ((mask0_b << 3u) & 0x10101010u);
var mask0_signed = select(mask0_exp, u32(-i32(mask0_exp)), is_down);
let mask1 = (mask >> 8u) & 0xffu;
let mask1_a = mask1 | (mask1 << 6u);
let mask1_b = mask1_a | (mask1_a << 12u);
let mask1_exp = (mask1_b & 0x1010101u) | ((mask1_b << 3u) & 0x10101010u);
var mask1_signed = select(mask1_exp, u32(-i32(mask1_exp)), is_down);
if is_bump {
let bump_delta = select(u32(-0x11111111), 0x11111111u, is_down);
mask0_signed += bump_delta;
mask1_signed += bump_delta;
}
atomicAdd(&sh_samples[pix_ix * 2u], mask0_signed);
atomicAdd(&sh_samples[pix_ix * 2u + 1u], mask1_signed);
#endif
}
workgroupBarrier();
}
var area: array<f32, PIXELS_PER_THREAD>;
let major = (th_ix * PIXELS_PER_THREAD) >> 3u;
var packed_w = atomicLoad(&sh_winding[major]);
// Prefix sum of packed 4 bit values within u32
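// The three shifted adds form a Hillis-Steele prefix sum over the 8 nibbles (shift
// by 1, 2, then 4 nibbles); subtracting 0x8888888 / 0x888888 / 0x8888 first strips
// the bias from the nibbles being shifted so the +8 bias doesn't accumulate.
// Afterwards each nibble holds its inclusive prefix winding, still biased by 8.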
packed_w += (packed_w - 0x8888888u) << 4u;
packed_w += (packed_w - 0x888888u) << 8u;
packed_w += (packed_w - 0x8888u) << 16u;
// Note: could probably do bias in one go, but it would be inscrutable
if (major & 1u) != 0u {
// We could use shmem to communicate the value from another thread;
// if we had subgroups that would almost certainly be the most
// efficient way. But we just calculate again for simplicity.
var last_packed = atomicLoad(&sh_winding[major - 1u]);
last_packed += (last_packed - 0x8888888u) << 4u;
last_packed += (last_packed - 0x888888u) << 8u;
last_packed += (last_packed - 0x8888u) << 16u;
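// The top nibble of the previous word is its total (biased) winding; subtracting
// the bias and multiplying by 0x11111111 carries that total into every nibble here.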
let bump = ((last_packed >> 28u) - 8u) * 0x11111111u;
packed_w += bump;
}
var packed_y = atomicLoad(&sh_winding_y[local_id.y >> 3u]);
packed_y += (packed_y - 0x8888888u) << 4u;
packed_y += (packed_y - 0x888888u) << 8u;
packed_y += (packed_y - 0x8888u) << 16u;
if th_ix == 0u {
atomicStore(&sh_winding_y[0], packed_y);
}
workgroupBarrier();
var wind_y = (packed_y >> ((local_id.y & 7u) << 2u)) - 8u;
if local_id.y >= 8u {
wind_y += (atomicLoad(&sh_winding_y[0]) >> 28u) - 8u;
}
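// Resolve coverage per pixel: combine the x prefix sums (packed_w), the per-row
// crossings accumulated in sh_winding_y (wind_y) and the backdrop to find the
// nibble value a sample would hold if it were outside the path; samples whose
// stored nibble differs from that value are inside.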
for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
let pix_ix = th_ix * PIXELS_PER_THREAD + i;
let minor = pix_ix & 7u;
//let nonzero = ((packed_w >> (minor << 2u)) & 0xfu) != u32(8 + backdrop);
// TODO: math might be off here
let expected_zero = (((packed_w >> (minor * 4u)) + wind_y) & 0xfu) - u32(backdrop);
if expected_zero >= 16u {
area[i] = 1.0;
} else {
#ifdef msaa8
let samples = atomicLoad(&sh_samples[pix_ix]);
let xored = (expected_zero * 0x11111111u) ^ samples;
// Each 4-bit nibble in xored is 0 for winding = 0, nonzero otherwise
let xored2 = xored | (xored * 2u);
let xored4 = xored2 | (xored2 * 4u);
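// Bit 3 of each nibble is now set iff that sample's winding differs from the
// outside value, so the popcount over 0x88888888 is the number of covered samples.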
area[i] = f32(countOneBits(xored4 & 0x88888888u)) * 0.125;
#endif
#ifdef msaa16
let samples0 = atomicLoad(&sh_samples[pix_ix * 2u]);
let samples1 = atomicLoad(&sh_samples[pix_ix * 2u + 1u]);
let xored0 = (expected_zero * 0x11111111u) ^ samples0;
let xored0_2 = xored0 | (xored0 * 2u);
let xored1 = (expected_zero * 0x11111111u) ^ samples1;
let xored1_2 = xored1 | (xored1 >> 1u);
let xored2 = (xored0_2 & 0xAAAAAAAAu) | (xored1_2 & 0x55555555u);
let xored4 = xored2 | (xored2 * 4u);
area[i] = f32(countOneBits(xored4 & 0xCCCCCCCCu)) * 0.0625;
#endif
}
}
return area;
}
// The workgroup X size should be 16 (TILE_WIDTH) / PIXELS_PER_THREAD
@compute @workgroup_size(4, 16)
fn main(
@builtin(global_invocation_id) global_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
) {
let tile_ix = wg_id.y * config.width_in_tiles + wg_id.x;
let xy = vec2(f32(global_id.x * PIXELS_PER_THREAD), f32(global_id.y));
#ifdef full
var rgba: array<vec4<f32>, PIXELS_PER_THREAD>;
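// base_color is assumed to be packed as 0xRRGGBBAA; unpack4x8unorm yields
// components starting from the least significant byte, so .wzyx reorders
// them to (r, g, b, a).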
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
rgba[i] = unpack4x8unorm(config.base_color).wzyx;
}
var blend_stack: array<array<u32, PIXELS_PER_THREAD>, BLEND_STACK_SPLIT>;
var clip_depth = 0u;
var area: array<f32, PIXELS_PER_THREAD>;
var cmd_ix = tile_ix * PTCL_INITIAL_ALLOC;
let blend_offset = ptcl[cmd_ix];
cmd_ix += 1u;
// main interpretation loop
while true {
let tag = ptcl[cmd_ix];
if tag == CMD_END {
break;
}
switch tag {
// CMD_FILL
case 1u: {
let fill = read_fill(cmd_ix);
let n_segs = fill.size_and_rule >> 1u;
let even_odd = (fill.size_and_rule & 1u) != 0u;
//area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd);
area = fill_path_ms(fill.seg_data, n_segs, fill.backdrop, wg_id.xy, local_id.xy, even_odd);
cmd_ix += 4u;
}
// CMD_STROKE
case 2u: {
// Stroking in fine rasterization is disabled, as strokes will be expanded
// to fills earlier in the pipeline. This implementation is a stub, just to
// keep the shader from crashing.
for (var i = 0u; i < PIXELS_PER_THREAD; i++) {
area[i] = 0.0;
}
cmd_ix += 3u;
}
// CMD_SOLID
case 3u: {
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
area[i] = 1.0;
}
cmd_ix += 1u;
}
// CMD_COLOR
case 5u: {
let color = read_color(cmd_ix);
let fg = unpack4x8unorm(color.rgba_color).wzyx;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let fg_i = fg * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
cmd_ix += 2u;
}
// CMD_LIN_GRAD
case 6u: {
let lin = read_lin_grad(cmd_ix);
let d = lin.line_x * xy.x + lin.line_y * xy.y + lin.line_c;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_d = d + lin.line_x * f32(i);
let x = i32(round(clamp(my_d, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
let fg_rgba = textureLoad(gradients, vec2(x, i32(lin.index)), 0);
let fg_i = fg_rgba * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
cmd_ix += 3u;
}
// CMD_RAD_GRAD
case 7u: {
let rad = read_rad_grad(cmd_ix);
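// The radial gradient parameters (matrx, xlat, c1, ra, roff) are precomputed
// upstream so that evaluating the gradient parameter t reduces to a dot product
// and a square root per pixel.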
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_xy = vec2(xy.x + f32(i), xy.y);
// TODO: can hoist y, but for now stick to the GLSL version
let xy_xformed = rad.matrx.xy * my_xy.x + rad.matrx.zw * my_xy.y + rad.xlat;
let ba = dot(xy_xformed, rad.c1);
let ca = rad.ra * dot(xy_xformed, xy_xformed);
let t = sqrt(ba * ba + ca) - ba - rad.roff;
let x = i32(round(clamp(t, 0.0, 1.0) * f32(GRADIENT_WIDTH - 1)));
let fg_rgba = textureLoad(gradients, vec2(x, i32(rad.index)), 0);
let fg_i = fg_rgba * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
cmd_ix += 3u;
}
// CMD_IMAGE
case 8u: {
let image = read_image(cmd_ix);
let atlas_extents = image.atlas_offset + image.extents;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_xy = vec2(xy.x + f32(i), xy.y);
let atlas_uv = image.matrx.xy * my_xy.x + image.matrx.zw * my_xy.y + image.xlat + image.atlas_offset;
// This currently clips to the image bounds. TODO: extend modes
if all(atlas_uv < atlas_extents) && area[i] != 0.0 {
let uv_quad = vec4(max(floor(atlas_uv), image.atlas_offset), min(ceil(atlas_uv), atlas_extents));
let uv_frac = fract(atlas_uv);
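// Bilinear filter: uv_quad packs the clamped floor/ceil texel corners, and the
// four premultiplied loads are mixed by the fractional position within the texel.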
let a = premul_alpha(textureLoad(image_atlas, vec2<i32>(uv_quad.xy), 0));
let b = premul_alpha(textureLoad(image_atlas, vec2<i32>(uv_quad.xw), 0));
let c = premul_alpha(textureLoad(image_atlas, vec2<i32>(uv_quad.zy), 0));
let d = premul_alpha(textureLoad(image_atlas, vec2<i32>(uv_quad.zw), 0));
let fg_rgba = mix(mix(a, b, uv_frac.y), mix(c, d, uv_frac.y), uv_frac.x);
let fg_i = fg_rgba * area[i];
rgba[i] = rgba[i] * (1.0 - fg_i.a) + fg_i;
}
}
cmd_ix += 2u;
}
// CMD_BEGIN_CLIP
case 9u: {
if clip_depth < BLEND_STACK_SPLIT {
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
blend_stack[clip_depth][i] = pack4x8unorm(rgba[i]);
rgba[i] = vec4(0.0);
}
} else {
// TODO: spill to memory
}
clip_depth += 1u;
cmd_ix += 1u;
}
// CMD_END_CLIP
case 10u: {
let end_clip = read_end_clip(cmd_ix);
clip_depth -= 1u;
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
var bg_rgba: u32;
if clip_depth < BLEND_STACK_SPLIT {
bg_rgba = blend_stack[clip_depth][i];
} else {
// load from memory
}
let bg = unpack4x8unorm(bg_rgba);
let fg = rgba[i] * area[i] * end_clip.alpha;
rgba[i] = blend_mix_compose(bg, fg, end_clip.blend);
}
cmd_ix += 3u;
}
// CMD_JUMP
case 11u: {
cmd_ix = ptcl[cmd_ix + 1u];
}
default: {}
}
}
let xy_uint = vec2<u32>(xy);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let coords = xy_uint + vec2(i, 0u);
if coords.x < config.target_width && coords.y < config.target_height {
let fg = rgba[i];
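// Compositing above is in premultiplied alpha; convert to separate alpha for the output.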
// Max with a small epsilon to avoid NaNs
let a_inv = 1.0 / max(fg.a, 1e-6);
let rgba_sep = vec4(fg.rgb * a_inv, fg.a);
textureStore(output, vec2<i32>(coords), rgba_sep);
}
}
#else
let tile = tiles[tile_ix];
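// Note: the simple (path-only) mode has not been updated to the current fill_path
// signature (it would need a segment offset, count and fill rule); treat this call
// as a stub.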
let area = fill_path(tile, xy);
let xy_uint = vec2<u32>(xy);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let coords = xy_uint + vec2(i, 0u);
if coords.x < config.target_width && coords.y < config.target_height {
textureStore(output, vec2<i32>(coords), vec4(area[i]));
}
}
#endif
}
fn premul_alpha(rgba: vec4<f32>) -> vec4<f32> {
return vec4(rgba.rgb * rgba.a, rgba.a);
}