Merge pull request #137 from linebender/draw_element

Add draw object stage
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 3bc1365..4dd01ed 100644
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/backdrop.spv
Binary files differ
diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
index c02f92c..b00e3cd 100644
--- a/piet-gpu/shader/backdrop_lg.spv
+++ b/piet-gpu/shader/backdrop_lg.spv
Binary files differ
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 7c5c316..38d10b3 100644
--- a/piet-gpu/shader/binning.spv
+++ b/piet-gpu/shader/binning.spv
Binary files differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index c8b4858..1df1876 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -78,3 +78,19 @@
 build gen/pathseg.hlsl: hlsl gen/pathseg.spv
 build gen/pathseg.dxil: dxil gen/pathseg.hlsl
 build gen/pathseg.msl: msl gen/pathseg.spv
+
+build gen/draw_reduce.spv: glsl draw_reduce.comp | scene.h drawtag.h setup.h mem.h
+build gen/draw_reduce.hlsl: hlsl gen/draw_reduce.spv
+build gen/draw_reduce.dxil: dxil gen/draw_reduce.hlsl
+build gen/draw_reduce.msl: msl gen/draw_reduce.spv
+
+build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h
+  flags = -DROOT
+build gen/draw_root.hlsl: hlsl gen/draw_root.spv
+build gen/draw_root.dxil: dxil gen/draw_root.hlsl
+build gen/draw_root.msl: msl gen/draw_root.spv
+
+build gen/draw_leaf.spv: glsl draw_leaf.comp | scene.h drawtag.h setup.h mem.h
+build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv
+build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl
+build gen/draw_leaf.msl: msl gen/draw_leaf.spv
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index a0ad82a..a2071ad 100644
--- a/piet-gpu/shader/coarse.spv
+++ b/piet-gpu/shader/coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp
new file mode 100644
index 0000000..ec6a928
--- /dev/null
+++ b/piet-gpu/shader/draw_leaf.comp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The leaf scan pass for draw tag scan implemented as a tree reduction.
+// This stage can be fused with its consumer but is separate now.
+
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+#include "tile.h"
+#include "drawtag.h"
+
+#define Monoid DrawMonoid
+
+layout(set = 0, binding = 3) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    ElementRef ref = ElementRef(ix * Element_size);
+    uint tag_word = Element_tag(ref).tag;
+
+    Monoid agg = map_tag(tag_word);
+    local[0] = agg;
+    for (uint i = 1; i < N_ROWS; i++) {
+        tag_word = Element_tag(Element_index(ref, i)).tag;
+        agg = combine_tag_monoid(agg, map_tag(tag_word));
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_tag_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    Monoid row = tag_monoid_identity();
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_tag_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + gl_GlobalInvocationID.x * 2 * N_ROWS;
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_tag_monoid(row, local[i]);
+        memory[out_base + i * 2] = m.path_ix;
+        memory[out_base + i * 2 + 1] = m.clip_ix;
+    }
+}
diff --git a/piet-gpu/shader/draw_reduce.comp b/piet-gpu/shader/draw_reduce.comp
new file mode 100644
index 0000000..fe9ab2c
--- /dev/null
+++ b/piet-gpu/shader/draw_reduce.comp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduction phase for draw scan implemented as a tree reduction.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+#include "scene.h"
+#include "drawtag.h"
+
+#define Monoid DrawMonoid
+
+layout(set = 0, binding = 3) buffer OutBuf {
+    Monoid[] outbuf;
+};
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    ElementRef ref = ElementRef(ix * Element_size);
+    uint tag_word = Element_tag(ref).tag;
+
+    Monoid agg = map_tag(tag_word);
+    for (uint i = 1; i < N_ROWS; i++) {
+        tag_word = Element_tag(Element_index(ref, i)).tag;
+        agg = combine_tag_monoid(agg, map_tag(tag_word));
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        // We could make this predicate tighter, but would it help?
+        if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
+            agg = combine_tag_monoid(agg, other);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0) {
+        outbuf[gl_WorkGroupID.x] = agg;
+    }
+}
diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp
new file mode 100644
index 0000000..d883671
--- /dev/null
+++ b/piet-gpu/shader/draw_scan.comp
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// A scan pass for draw tag scan implemented as a tree reduction.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "drawtag.h"
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+#define Monoid DrawMonoid
+#define combine_monoid combine_tag_monoid
+#define monoid_identity tag_monoid_identity
+
+layout(binding = 0) buffer DataBuf {
+    Monoid[] data;
+};
+
+#ifndef ROOT
+layout(binding = 1) readonly buffer ParentBuf {
+    Monoid[] parent;
+};
+#endif
+
+shared Monoid sh_scratch[WG_SIZE];
+
+void main() {
+    Monoid local[N_ROWS];
+
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+
+    local[0] = data[ix];
+    for (uint i = 1; i < N_ROWS; i++) {
+        local[i] = combine_monoid(local[i - 1], data[ix + i]);
+    }
+    Monoid agg = local[N_ROWS - 1];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1u << i)) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
+            agg = combine_monoid(other, agg);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    
+    barrier();
+    // This could be a semigroup instead of a monoid if we reworked the
+    // conditional logic, but that might impact performance.
+    Monoid row = monoid_identity();
+#ifdef ROOT
+    if (gl_LocalInvocationID.x > 0) {
+        row = sh_scratch[gl_LocalInvocationID.x - 1];
+    }
+#else
+    if (gl_WorkGroupID.x > 0) {
+        row = parent[gl_WorkGroupID.x - 1];
+    }
+    if (gl_LocalInvocationID.x > 0) {
+        row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
+    }
+#endif
+    for (uint i = 0; i < N_ROWS; i++) {
+        Monoid m = combine_monoid(row, local[i]);
+        data[ix + i] = m;
+    }
+}
diff --git a/piet-gpu/shader/drawtag.h b/piet-gpu/shader/drawtag.h
new file mode 100644
index 0000000..a9e8a1d
--- /dev/null
+++ b/piet-gpu/shader/drawtag.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Common data structures and functions for the draw tag stream.
+
+struct DrawMonoid {
+    uint path_ix;
+    uint clip_ix;
+};
+
+DrawMonoid tag_monoid_identity() {
+    return DrawMonoid(0, 0);
+}
+
+DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) {
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+#ifdef Element_size
+DrawMonoid map_tag(uint tag_word) {
+    switch (tag_word) {
+    case Element_FillColor:
+    case Element_FillLinGradient:
+    case Element_FillImage:
+        return DrawMonoid(1, 0);
+    case Element_BeginClip:
+        return DrawMonoid(1, 1);
+    case Element_EndClip:
+        return DrawMonoid(0, 1);
+    default:
+        return DrawMonoid(0, 0);
+    }
+}
+#endif
diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl
index ae40b13..7a4e86a 100644
--- a/piet-gpu/shader/gen/bbox_clear.hlsl
+++ b/piet-gpu/shader/gen/bbox_clear.hlsl
@@ -16,6 +16,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl
index f424448..6f73531 100644
--- a/piet-gpu/shader/gen/bbox_clear.msl
+++ b/piet-gpu/shader/gen/bbox_clear.msl
@@ -21,6 +21,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv
index 181f99b..2b659f4 100644
--- a/piet-gpu/shader/gen/bbox_clear.spv
+++ b/piet-gpu/shader/gen/bbox_clear.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil
new file mode 100644
index 0000000..17bace7
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl
new file mode 100644
index 0000000..e5f50fd
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl
@@ -0,0 +1,190 @@
+struct ElementRef
+{
+    uint offset;
+};
+
+struct ElementTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint trans_offset;
+    uint pathtag_offset;
+    uint linewidth_offset;
+    uint pathseg_offset;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const DrawMonoid _67 = { 0u, 0u };
+static const DrawMonoid _94 = { 1u, 0u };
+static const DrawMonoid _96 = { 1u, 1u };
+static const DrawMonoid _98 = { 0u, 1u };
+
+ByteAddressBuffer _49 : register(t2);
+ByteAddressBuffer _218 : register(t3);
+ByteAddressBuffer _248 : register(t1);
+RWByteAddressBuffer _277 : register(u0);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[512];
+
+ElementTag Element_tag(ElementRef ref)
+{
+    uint tag_and_flags = _49.Load((ref.offset >> uint(2)) * 4 + 0);
+    ElementTag _63 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _63;
+}
+
+DrawMonoid map_tag(uint tag_word)
+{
+    switch (tag_word)
+    {
+        case 4u:
+        case 5u:
+        case 6u:
+        {
+            return _94;
+        }
+        case 9u:
+        {
+            return _96;
+        }
+        case 10u:
+        {
+            return _98;
+        }
+        default:
+        {
+            return _67;
+        }
+    }
+}
+
+ElementRef Element_index(ElementRef ref, uint index)
+{
+    ElementRef _42 = { ref.offset + (index * 36u) };
+    return _42;
+}
+
+DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+DrawMonoid tag_monoid_identity()
+{
+    return _67;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    ElementRef _115 = { ix * 36u };
+    ElementRef ref = _115;
+    ElementRef param = ref;
+    uint tag_word = Element_tag(param).tag;
+    uint param_1 = tag_word;
+    DrawMonoid agg = map_tag(param_1);
+    DrawMonoid local[8];
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        ElementRef param_2 = ref;
+        uint param_3 = i;
+        ElementRef param_4 = Element_index(param_2, param_3);
+        tag_word = Element_tag(param_4).tag;
+        uint param_5 = tag_word;
+        DrawMonoid param_6 = agg;
+        DrawMonoid param_7 = map_tag(param_5);
+        agg = combine_tag_monoid(param_6, param_7);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            DrawMonoid param_8 = other;
+            DrawMonoid param_9 = agg;
+            agg = combine_tag_monoid(param_8, param_9);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    DrawMonoid row = tag_monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        DrawMonoid _224;
+        _224.path_ix = _218.Load((gl_WorkGroupID.x - 1u) * 8 + 0);
+        _224.clip_ix = _218.Load((gl_WorkGroupID.x - 1u) * 8 + 4);
+        row.path_ix = _224.path_ix;
+        row.clip_ix = _224.clip_ix;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        DrawMonoid param_10 = row;
+        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_tag_monoid(param_10, param_11);
+    }
+    uint out_base = (_248.Load(44) >> uint(2)) + ((gl_GlobalInvocationID.x * 2u) * 8u);
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        DrawMonoid param_12 = row;
+        DrawMonoid param_13 = local[i_2];
+        DrawMonoid m = combine_tag_monoid(param_12, param_13);
+        _277.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix);
+        _277.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix);
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl
new file mode 100644
index 0000000..d52a560
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.msl
@@ -0,0 +1,235 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct ElementRef
+{
+    uint offset;
+};
+
+struct ElementTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct DrawMonoid_1
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct ParentBuf
+{
+    DrawMonoid_1 parent[1];
+};
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint trans_offset;
+    uint pathtag_offset;
+    uint linewidth_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_49)
+{
+    uint tag_and_flags = v_49.scene[ref.offset >> uint(2)];
+    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid map_tag(thread const uint& tag_word)
+{
+    switch (tag_word)
+    {
+        case 4u:
+        case 5u:
+        case 6u:
+        {
+            return DrawMonoid{ 1u, 0u };
+        }
+        case 9u:
+        {
+            return DrawMonoid{ 1u, 1u };
+        }
+        case 10u:
+        {
+            return DrawMonoid{ 0u, 1u };
+        }
+        default:
+        {
+            return DrawMonoid{ 0u, 0u };
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
+{
+    return ElementRef{ ref.offset + (index * 36u) };
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid tag_monoid_identity()
+{
+    return DrawMonoid{ 0u, 0u };
+}
+
+kernel void main0(device Memory& _277 [[buffer(0)]], const device ConfigBuf& _248 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], const device ParentBuf& _218 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup DrawMonoid sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    ElementRef ref = ElementRef{ ix * 36u };
+    ElementRef param = ref;
+    uint tag_word = Element_tag(param, v_49).tag;
+    uint param_1 = tag_word;
+    DrawMonoid agg = map_tag(param_1);
+    spvUnsafeArray<DrawMonoid, 8> local;
+    local[0] = agg;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        ElementRef param_2 = ref;
+        uint param_3 = i;
+        ElementRef param_4 = Element_index(param_2, param_3);
+        tag_word = Element_tag(param_4, v_49).tag;
+        uint param_5 = tag_word;
+        DrawMonoid param_6 = agg;
+        DrawMonoid param_7 = map_tag(param_5);
+        agg = combine_tag_monoid(param_6, param_7);
+        local[i] = agg;
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            DrawMonoid param_8 = other;
+            DrawMonoid param_9 = agg;
+            agg = combine_tag_monoid(param_8, param_9);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    DrawMonoid row = tag_monoid_identity();
+    if (gl_WorkGroupID.x > 0u)
+    {
+        uint _221 = gl_WorkGroupID.x - 1u;
+        row.path_ix = _218.parent[_221].path_ix;
+        row.clip_ix = _218.parent[_221].clip_ix;
+    }
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        DrawMonoid param_10 = row;
+        DrawMonoid param_11 = sh_scratch[gl_LocalInvocationID.x - 1u];
+        row = combine_tag_monoid(param_10, param_11);
+    }
+    uint out_base = (_248.conf.drawmonoid_alloc.offset >> uint(2)) + ((gl_GlobalInvocationID.x * 2u) * 8u);
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        DrawMonoid param_12 = row;
+        DrawMonoid param_13 = local[i_2];
+        DrawMonoid m = combine_tag_monoid(param_12, param_13);
+        _277.memory[out_base + (i_2 * 2u)] = m.path_ix;
+        _277.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix;
+    }
+}
+
diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv
new file mode 100644
index 0000000..30740a2
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil
new file mode 100644
index 0000000..f1e48e1
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl
new file mode 100644
index 0000000..27c206a
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.hlsl
@@ -0,0 +1,162 @@
+struct ElementRef
+{
+    uint offset;
+};
+
+struct ElementTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint trans_offset;
+    uint pathtag_offset;
+    uint linewidth_offset;
+    uint pathseg_offset;
+};
+
+static const DrawMonoid _88 = { 1u, 0u };
+static const DrawMonoid _90 = { 1u, 1u };
+static const DrawMonoid _92 = { 0u, 1u };
+static const DrawMonoid _94 = { 0u, 0u };
+
+ByteAddressBuffer _46 : register(t2);
+RWByteAddressBuffer _203 : register(u3);
+RWByteAddressBuffer _217 : register(u0);
+ByteAddressBuffer _223 : register(t1);
+
+static uint3 gl_WorkGroupID;
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_WorkGroupID : SV_GroupID;
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[512];
+
+ElementTag Element_tag(ElementRef ref)
+{
+    uint tag_and_flags = _46.Load((ref.offset >> uint(2)) * 4 + 0);
+    ElementTag _60 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+    return _60;
+}
+
+DrawMonoid map_tag(uint tag_word)
+{
+    switch (tag_word)
+    {
+        case 4u:
+        case 5u:
+        case 6u:
+        {
+            return _88;
+        }
+        case 9u:
+        {
+            return _90;
+        }
+        case 10u:
+        {
+            return _92;
+        }
+        default:
+        {
+            return _94;
+        }
+    }
+}
+
+ElementRef Element_index(ElementRef ref, uint index)
+{
+    ElementRef _39 = { ref.offset + (index * 36u) };
+    return _39;
+}
+
+DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    ElementRef _110 = { ix * 36u };
+    ElementRef ref = _110;
+    ElementRef param = ref;
+    uint tag_word = Element_tag(param).tag;
+    uint param_1 = tag_word;
+    DrawMonoid agg = map_tag(param_1);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        ElementRef param_2 = ref;
+        uint param_3 = i;
+        ElementRef param_4 = Element_index(param_2, param_3);
+        tag_word = Element_tag(param_4).tag;
+        uint param_5 = tag_word;
+        DrawMonoid param_6 = agg;
+        DrawMonoid param_7 = map_tag(param_5);
+        agg = combine_tag_monoid(param_6, param_7);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            DrawMonoid param_8 = agg;
+            DrawMonoid param_9 = other;
+            agg = combine_tag_monoid(param_8, param_9);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _203.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix);
+        _203.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix);
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_WorkGroupID = stage_input.gl_WorkGroupID;
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl
new file mode 100644
index 0000000..dd2f517
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.msl
@@ -0,0 +1,169 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+struct ElementRef
+{
+    uint offset;
+};
+
+struct ElementTag
+{
+    uint tag;
+    uint flags;
+};
+
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct SceneBuf
+{
+    uint scene[1];
+};
+
+struct DrawMonoid_1
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct OutBuf
+{
+    DrawMonoid_1 outbuf[1];
+};
+
+struct Memory
+{
+    uint mem_offset;
+    uint mem_error;
+    uint memory[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+struct Alloc
+{
+    uint offset;
+};
+
+struct Config
+{
+    uint n_elements;
+    uint n_pathseg;
+    uint width_in_tiles;
+    uint height_in_tiles;
+    Alloc tile_alloc;
+    Alloc bin_alloc;
+    Alloc ptcl_alloc;
+    Alloc pathseg_alloc;
+    Alloc anno_alloc;
+    Alloc trans_alloc;
+    Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
+    uint n_trans;
+    uint trans_offset;
+    uint pathtag_offset;
+    uint linewidth_offset;
+    uint pathseg_offset;
+};
+
+struct ConfigBuf
+{
+    Config conf;
+};
+
+static inline __attribute__((always_inline))
+ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_46)
+{
+    uint tag_and_flags = v_46.scene[ref.offset >> uint(2)];
+    return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid map_tag(thread const uint& tag_word)
+{
+    switch (tag_word)
+    {
+        case 4u:
+        case 5u:
+        case 6u:
+        {
+            return DrawMonoid{ 1u, 0u };
+        }
+        case 9u:
+        {
+            return DrawMonoid{ 1u, 1u };
+        }
+        case 10u:
+        {
+            return DrawMonoid{ 0u, 1u };
+        }
+        default:
+        {
+            return DrawMonoid{ 0u, 0u };
+        }
+    }
+}
+
+static inline __attribute__((always_inline))
+ElementRef Element_index(thread const ElementRef& ref, thread const uint& index)
+{
+    return ElementRef{ ref.offset + (index * 36u) };
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
+{
+    threadgroup DrawMonoid sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    ElementRef ref = ElementRef{ ix * 36u };
+    ElementRef param = ref;
+    uint tag_word = Element_tag(param, v_46).tag;
+    uint param_1 = tag_word;
+    DrawMonoid agg = map_tag(param_1);
+    for (uint i = 1u; i < 8u; i++)
+    {
+        ElementRef param_2 = ref;
+        uint param_3 = i;
+        ElementRef param_4 = Element_index(param_2, param_3);
+        tag_word = Element_tag(param_4, v_46).tag;
+        uint param_5 = tag_word;
+        DrawMonoid param_6 = agg;
+        DrawMonoid param_7 = map_tag(param_5);
+        agg = combine_tag_monoid(param_6, param_7);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
+            DrawMonoid param_8 = agg;
+            DrawMonoid param_9 = other;
+            agg = combine_tag_monoid(param_8, param_9);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0u)
+    {
+        _203.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix;
+        _203.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix;
+    }
+}
+
diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv
new file mode 100644
index 0000000..286bd33
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil
new file mode 100644
index 0000000..da5cfe2
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl
new file mode 100644
index 0000000..7dc68b1
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.hlsl
@@ -0,0 +1,94 @@
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
+
+static const DrawMonoid _18 = { 0u, 0u };
+
+RWByteAddressBuffer _57 : register(u0);
+
+static uint3 gl_LocalInvocationID;
+static uint3 gl_GlobalInvocationID;
+struct SPIRV_Cross_Input
+{
+    uint3 gl_LocalInvocationID : SV_GroupThreadID;
+    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
+};
+
+groupshared DrawMonoid sh_scratch[512];
+
+DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+DrawMonoid tag_monoid_identity()
+{
+    return _18;
+}
+
+void comp_main()
+{
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    DrawMonoid _61;
+    _61.path_ix = _57.Load(ix * 8 + 0);
+    _61.clip_ix = _57.Load(ix * 8 + 4);
+    DrawMonoid local[8];
+    local[0].path_ix = _61.path_ix;
+    local[0].clip_ix = _61.clip_ix;
+    DrawMonoid param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        DrawMonoid param = local[i - 1u];
+        DrawMonoid _88;
+        _88.path_ix = _57.Load((ix + i) * 8 + 0);
+        _88.clip_ix = _57.Load((ix + i) * 8 + 4);
+        param_1.path_ix = _88.path_ix;
+        param_1.clip_ix = _88.clip_ix;
+        local[i] = combine_tag_monoid(param, param_1);
+    }
+    DrawMonoid agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        GroupMemoryBarrierWithGroupSync();
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            DrawMonoid param_2 = other;
+            DrawMonoid param_3 = agg;
+            agg = combine_tag_monoid(param_2, param_3);
+        }
+        GroupMemoryBarrierWithGroupSync();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    GroupMemoryBarrierWithGroupSync();
+    DrawMonoid row = tag_monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        DrawMonoid param_4 = row;
+        DrawMonoid param_5 = local[i_2];
+        DrawMonoid m = combine_tag_monoid(param_4, param_5);
+        uint _178 = ix + i_2;
+        _57.Store(_178 * 8 + 0, m.path_ix);
+        _57.Store(_178 * 8 + 4, m.clip_ix);
+    }
+}
+
+[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input)
+{
+    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
+    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
+    comp_main();
+}
diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl
new file mode 100644
index 0000000..2ed7ba2
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.msl
@@ -0,0 +1,128 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wmissing-braces"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+template<typename T, size_t Num>
+struct spvUnsafeArray
+{
+    T elements[Num ? Num : 1];
+    
+    thread T& operator [] (size_t pos) thread
+    {
+        return elements[pos];
+    }
+    constexpr const thread T& operator [] (size_t pos) const thread
+    {
+        return elements[pos];
+    }
+    
+    device T& operator [] (size_t pos) device
+    {
+        return elements[pos];
+    }
+    constexpr const device T& operator [] (size_t pos) const device
+    {
+        return elements[pos];
+    }
+    
+    constexpr const constant T& operator [] (size_t pos) const constant
+    {
+        return elements[pos];
+    }
+    
+    threadgroup T& operator [] (size_t pos) threadgroup
+    {
+        return elements[pos];
+    }
+    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
+    {
+        return elements[pos];
+    }
+};
+
+struct DrawMonoid
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct DrawMonoid_1
+{
+    uint path_ix;
+    uint clip_ix;
+};
+
+struct DataBuf
+{
+    DrawMonoid_1 data[1];
+};
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
+
+static inline __attribute__((always_inline))
+DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
+{
+    DrawMonoid c;
+    c.path_ix = a.path_ix + b.path_ix;
+    c.clip_ix = a.clip_ix + b.clip_ix;
+    return c;
+}
+
+static inline __attribute__((always_inline))
+DrawMonoid tag_monoid_identity()
+{
+    return DrawMonoid{ 0u, 0u };
+}
+
+kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
+{
+    threadgroup DrawMonoid sh_scratch[512];
+    uint ix = gl_GlobalInvocationID.x * 8u;
+    spvUnsafeArray<DrawMonoid, 8> local;
+    local[0].path_ix = _57.data[ix].path_ix;
+    local[0].clip_ix = _57.data[ix].clip_ix;
+    DrawMonoid param_1;
+    for (uint i = 1u; i < 8u; i++)
+    {
+        uint _82 = ix + i;
+        DrawMonoid param = local[i - 1u];
+        param_1.path_ix = _57.data[_82].path_ix;
+        param_1.clip_ix = _57.data[_82].clip_ix;
+        local[i] = combine_tag_monoid(param, param_1);
+    }
+    DrawMonoid agg = local[7];
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i_1 = 0u; i_1 < 9u; i_1++)
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        if (gl_LocalInvocationID.x >= (1u << i_1))
+        {
+            DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
+            DrawMonoid param_2 = other;
+            DrawMonoid param_3 = agg;
+            agg = combine_tag_monoid(param_2, param_3);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    DrawMonoid row = tag_monoid_identity();
+    if (gl_LocalInvocationID.x > 0u)
+    {
+        row = sh_scratch[gl_LocalInvocationID.x - 1u];
+    }
+    for (uint i_2 = 0u; i_2 < 8u; i_2++)
+    {
+        DrawMonoid param_4 = row;
+        DrawMonoid param_5 = local[i_2];
+        DrawMonoid m = combine_tag_monoid(param_4, param_5);
+        uint _178 = ix + i_2;
+        _57.data[_178].path_ix = m.path_ix;
+        _57.data[_178].clip_ix = m.clip_ix;
+    }
+}
+
diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv
new file mode 100644
index 0000000..acecee3
--- /dev/null
+++ b/piet-gpu/shader/gen/draw_root.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil
index 5ad35e7..4464d9d 100644
--- a/piet-gpu/shader/gen/pathseg.dxil
+++ b/piet-gpu/shader/gen/pathseg.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl
index 40e60cd..e29ddd3 100644
--- a/piet-gpu/shader/gen/pathseg.hlsl
+++ b/piet-gpu/shader/gen/pathseg.hlsl
@@ -63,6 +63,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
@@ -354,7 +355,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint tag_word = _574.Load(((_639.Load(52) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
+    uint tag_word = _574.Load(((_639.Load(56) >> uint(2)) + (ix >> uint(2))) * 4 + 0);
     uint param = tag_word;
     TagMonoid local_tm = reduce_tag(param);
     sh_tag[gl_LocalInvocationID.x] = local_tm;
@@ -393,13 +394,13 @@
         TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u];
         tm = combine_tag_monoid(param_3, param_4);
     }
-    uint ps_ix = (_639.Load(60) >> uint(2)) + tm.pathseg_offset;
-    uint lw_ix = (_639.Load(56) >> uint(2)) + tm.linewidth_ix;
+    uint ps_ix = (_639.Load(64) >> uint(2)) + tm.pathseg_offset;
+    uint lw_ix = (_639.Load(60) >> uint(2)) + tm.linewidth_ix;
     uint save_path_ix = tm.path_ix;
-    TransformSegRef _769 = { _639.Load(36) + (tm.trans_ix * 24u) };
-    TransformSegRef trans_ref = _769;
-    PathSegRef _779 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
-    PathSegRef ps_ref = _779;
+    TransformSegRef _768 = { _639.Load(36) + (tm.trans_ix * 24u) };
+    TransformSegRef trans_ref = _768;
+    PathSegRef _778 = { _639.Load(28) + (tm.pathseg_ix * 52u) };
+    PathSegRef ps_ref = _778;
     float2 p0;
     float2 p1;
     float2 p2;
@@ -449,9 +450,9 @@
                 }
             }
             float linewidth = asfloat(_574.Load(lw_ix * 4 + 0));
-            Alloc _865;
-            _865.offset = _639.Load(36);
-            param_13.offset = _865.offset;
+            Alloc _864;
+            _864.offset = _639.Load(36);
+            param_13.offset = _864.offset;
             TransformSegRef param_14 = trans_ref;
             TransformSeg transform = TransformSeg_read(param_13, param_14);
             p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate;
@@ -460,25 +461,25 @@
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _935 = bbox;
-                float2 _938 = min(_935.xy, p2);
-                bbox.x = _938.x;
-                bbox.y = _938.y;
-                float4 _943 = bbox;
-                float2 _946 = max(_943.zw, p2);
-                bbox.z = _946.x;
-                bbox.w = _946.y;
+                float4 _934 = bbox;
+                float2 _937 = min(_934.xy, p2);
+                bbox.x = _937.x;
+                bbox.y = _937.y;
+                float4 _942 = bbox;
+                float2 _945 = max(_942.zw, p2);
+                bbox.z = _945.x;
+                bbox.w = _945.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _971 = bbox;
-                    float2 _974 = min(_971.xy, p3);
-                    bbox.x = _974.x;
-                    bbox.y = _974.y;
-                    float4 _979 = bbox;
-                    float2 _982 = max(_979.zw, p3);
-                    bbox.z = _982.x;
-                    bbox.w = _982.y;
+                    float4 _970 = bbox;
+                    float2 _973 = min(_970.xy, p3);
+                    bbox.x = _973.x;
+                    bbox.y = _973.y;
+                    float4 _978 = bbox;
+                    float2 _981 = max(_978.zw, p3);
+                    bbox.z = _981.x;
+                    bbox.w = _981.y;
                 }
                 else
                 {
@@ -509,9 +510,9 @@
             cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1;
             cubic.stroke = stroke;
             uint fill_mode = uint(linewidth >= 0.0f);
-            Alloc _1071;
-            _1071.offset = _639.Load(28);
-            param_15.offset = _1071.offset;
+            Alloc _1070;
+            _1070.offset = _639.Load(28);
+            param_15.offset = _1070.offset;
             PathSegRef param_16 = ps_ref;
             uint param_17 = fill_mode;
             PathCubic param_18 = cubic;
@@ -567,17 +568,17 @@
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1241 = i_4 == 3u;
-        bool _1248;
-        if (_1241)
+        bool _1240 = i_4 == 3u;
+        bool _1247;
+        if (_1240)
         {
-            _1248 = gl_LocalInvocationID.x == 511u;
+            _1247 = gl_LocalInvocationID.x == 511u;
         }
         else
         {
-            _1248 = _1241;
+            _1247 = _1240;
         }
-        if (_1248)
+        if (_1247)
         {
             do_atomic = true;
         }
@@ -603,30 +604,30 @@
         }
         if (do_atomic)
         {
-            bool _1300 = m.bbox.z > m.bbox.x;
-            bool _1309;
-            if (!_1300)
+            bool _1299 = m.bbox.z > m.bbox.x;
+            bool _1308;
+            if (!_1299)
             {
-                _1309 = m.bbox.w > m.bbox.y;
+                _1308 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1309 = _1300;
+                _1308 = _1299;
             }
-            if (_1309)
+            if (_1308)
             {
                 float param_29 = m.bbox.x;
-                uint _1318;
-                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1318);
+                uint _1317;
+                _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1317);
                 float param_30 = m.bbox.y;
-                uint _1326;
-                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1326);
+                uint _1325;
+                _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1325);
                 float param_31 = m.bbox.z;
-                uint _1334;
-                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1334);
+                uint _1333;
+                _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1333);
                 float param_32 = m.bbox.w;
-                uint _1342;
-                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1342);
+                uint _1341;
+                _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1341);
             }
             bbox_out_ix += 4u;
         }
diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl
index 25d001f..71299bd 100644
--- a/piet-gpu/shader/gen/pathseg.msl
+++ b/piet-gpu/shader/gen/pathseg.msl
@@ -128,6 +128,7 @@
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
     Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
@@ -530,25 +531,25 @@
             if (seg_type >= 2u)
             {
                 p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate;
-                float4 _935 = bbox;
-                float2 _938 = fast::min(_935.xy, p2);
-                bbox.x = _938.x;
-                bbox.y = _938.y;
-                float4 _943 = bbox;
-                float2 _946 = fast::max(_943.zw, p2);
-                bbox.z = _946.x;
-                bbox.w = _946.y;
+                float4 _934 = bbox;
+                float2 _937 = fast::min(_934.xy, p2);
+                bbox.x = _937.x;
+                bbox.y = _937.y;
+                float4 _942 = bbox;
+                float2 _945 = fast::max(_942.zw, p2);
+                bbox.z = _945.x;
+                bbox.w = _945.y;
                 if (seg_type == 3u)
                 {
                     p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate;
-                    float4 _971 = bbox;
-                    float2 _974 = fast::min(_971.xy, p3);
-                    bbox.x = _974.x;
-                    bbox.y = _974.y;
-                    float4 _979 = bbox;
-                    float2 _982 = fast::max(_979.zw, p3);
-                    bbox.z = _982.x;
-                    bbox.w = _982.y;
+                    float4 _970 = bbox;
+                    float2 _973 = fast::min(_970.xy, p3);
+                    bbox.x = _973.x;
+                    bbox.y = _973.y;
+                    float4 _978 = bbox;
+                    float2 _981 = fast::max(_978.zw, p3);
+                    bbox.z = _981.x;
+                    bbox.w = _981.y;
                 }
                 else
                 {
@@ -635,17 +636,17 @@
         Monoid param_24 = local[i_4];
         Monoid m = combine_monoid(param_23, param_24);
         bool do_atomic = false;
-        bool _1241 = i_4 == 3u;
-        bool _1248;
-        if (_1241)
+        bool _1240 = i_4 == 3u;
+        bool _1247;
+        if (_1240)
         {
-            _1248 = gl_LocalInvocationID.x == 511u;
+            _1247 = gl_LocalInvocationID.x == 511u;
         }
         else
         {
-            _1248 = _1241;
+            _1247 = _1240;
         }
-        if (_1248)
+        if (_1247)
         {
             do_atomic = true;
         }
@@ -671,26 +672,26 @@
         }
         if (do_atomic)
         {
-            bool _1300 = m.bbox.z > m.bbox.x;
-            bool _1309;
-            if (!_1300)
+            bool _1299 = m.bbox.z > m.bbox.x;
+            bool _1308;
+            if (!_1299)
             {
-                _1309 = m.bbox.w > m.bbox.y;
+                _1308 = m.bbox.w > m.bbox.y;
             }
             else
             {
-                _1309 = _1300;
+                _1308 = _1299;
             }
-            if (_1309)
+            if (_1308)
             {
                 float param_29 = m.bbox.x;
-                uint _1318 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
+                uint _1317 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed);
                 float param_30 = m.bbox.y;
-                uint _1326 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
+                uint _1325 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed);
                 float param_31 = m.bbox.z;
-                uint _1334 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
+                uint _1333 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed);
                 float param_32 = m.bbox.w;
-                uint _1342 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
+                uint _1341 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed);
             }
             bbox_out_ix += 4u;
         }
diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv
index 2ac684d..bc165ac 100644
--- a/piet-gpu/shader/gen/pathseg.spv
+++ b/piet-gpu/shader/gen/pathseg.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil
index 81448e7..02a4750 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.dxil
+++ b/piet-gpu/shader/gen/pathtag_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl
index 5ed84b8..5e98362 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.hlsl
+++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl
@@ -25,6 +25,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
@@ -81,7 +82,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 4u;
-    uint scene_ix = (_139.Load(52) >> uint(2)) + ix;
+    uint scene_ix = (_139.Load(56) >> uint(2)) + ix;
     uint tag_word = _151.Load(scene_ix * 4 + 0);
     uint param = tag_word;
     TagMonoid agg = reduce_tag(param);
diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl
index edb6d03..38451d4 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.msl
+++ b/piet-gpu/shader/gen/pathtag_reduce.msl
@@ -32,6 +32,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv
index 44cd938..eef46a2 100644
--- a/piet-gpu/shader/gen/pathtag_reduce.spv
+++ b/piet-gpu/shader/gen/pathtag_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil
index 3864dbe..dabc049 100644
--- a/piet-gpu/shader/gen/transform_leaf.dxil
+++ b/piet-gpu/shader/gen/transform_leaf.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl
index c0343f0..2f0de05 100644
--- a/piet-gpu/shader/gen/transform_leaf.hlsl
+++ b/piet-gpu/shader/gen/transform_leaf.hlsl
@@ -38,6 +38,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
@@ -148,7 +149,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _285 = { _278.Load(48) + (ix * 24u) };
+    TransformRef _285 = { _278.Load(52) + (ix * 24u) };
     TransformRef ref = _285;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl
index 16c1e13..3120b3d 100644
--- a/piet-gpu/shader/gen/transform_leaf.msl
+++ b/piet-gpu/shader/gen/transform_leaf.msl
@@ -101,6 +101,7 @@
     Alloc_1 anno_alloc;
     Alloc_1 trans_alloc;
     Alloc_1 bbox_alloc;
+    Alloc_1 drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv
index 49c9789..01f047b 100644
--- a/piet-gpu/shader/gen/transform_leaf.spv
+++ b/piet-gpu/shader/gen/transform_leaf.spv
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil
index f9e1cbf..68997d0 100644
--- a/piet-gpu/shader/gen/transform_reduce.dxil
+++ b/piet-gpu/shader/gen/transform_reduce.dxil
Binary files differ
diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl
index 75e7e3f..9d8a5d6 100644
--- a/piet-gpu/shader/gen/transform_reduce.hlsl
+++ b/piet-gpu/shader/gen/transform_reduce.hlsl
@@ -27,6 +27,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
@@ -85,7 +86,7 @@
 void comp_main()
 {
     uint ix = gl_GlobalInvocationID.x * 8u;
-    TransformRef _168 = { _161.Load(48) + (ix * 24u) };
+    TransformRef _168 = { _161.Load(52) + (ix * 24u) };
     TransformRef ref = _168;
     TransformRef param = ref;
     Transform agg = Transform_read(param);
diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl
index aabfaed..e61b602 100644
--- a/piet-gpu/shader/gen/transform_reduce.msl
+++ b/piet-gpu/shader/gen/transform_reduce.msl
@@ -39,6 +39,7 @@
     Alloc anno_alloc;
     Alloc trans_alloc;
     Alloc bbox_alloc;
+    Alloc drawmonoid_alloc;
     uint n_trans;
     uint trans_offset;
     uint pathtag_offset;
diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv
index 451775d..77eadb2 100644
--- a/piet-gpu/shader/gen/transform_reduce.spv
+++ b/piet-gpu/shader/gen/transform_reduce.spv
Binary files differ
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 4db2c3a..04b6364 100644
--- a/piet-gpu/shader/kernel4.spv
+++ b/piet-gpu/shader/kernel4.spv
Binary files differ
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index 2fc59fe..240f8f7 100644
--- a/piet-gpu/shader/path_coarse.spv
+++ b/piet-gpu/shader/path_coarse.spv
Binary files differ
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index c74903e..3bb1fdd 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -42,6 +42,8 @@
 
     // Bounding boxes of paths, stored as int (so atomics work)
     Alloc bbox_alloc;
+    // Monoid for draw objects
+    Alloc drawmonoid_alloc;
 
     // Number of transforms in scene
     // This is probably not needed.
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index 69dddf5..0de00e3 100644
--- a/piet-gpu/shader/tile_alloc.spv
+++ b/piet-gpu/shader/tile_alloc.spv
Binary files differ
diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs
index 59e8b50..f4a086c 100644
--- a/piet-gpu/src/stages.rs
+++ b/piet-gpu/src/stages.rs
@@ -16,11 +16,13 @@
 
 //! Stages for new element pipeline, exposed for testing.
 
+mod draw;
 mod path;
 mod transform;
 
 use bytemuck::{Pod, Zeroable};
 
+pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage};
 pub use path::{PathBinding, PathCode, PathEncoder, PathStage};
 pub use transform::{Transform, TransformBinding, TransformCode, TransformStage};
 
@@ -41,6 +43,7 @@
     pub anno_alloc: u32,
     pub trans_alloc: u32,
     pub bbox_alloc: u32,
+    pub drawmonoid_alloc: u32,
     pub n_trans: u32,
     pub trans_offset: u32,
     pub pathtag_offset: u32,
diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs
new file mode 100644
index 0000000..d50c6cb
--- /dev/null
+++ b/piet-gpu/src/stages/draw.rs
@@ -0,0 +1,163 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! The draw object stage of the element processing pipeline.
+
+use bytemuck::{Pod, Zeroable};
+
+use piet_gpu_hal::{
+    include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
+};
+
+/// The output element of the draw object stage.
+#[repr(C)]
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Zeroable, Pod)]
+pub struct DrawMonoid {
+    pub path_ix: u32,
+    pub clip_ix: u32,
+}
+
+const DRAW_WG: u64 = 512;
+const DRAW_N_ROWS: u64 = 8;
+const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS;
+
+pub struct DrawCode {
+    reduce_pipeline: Pipeline,
+    root_pipeline: Pipeline,
+    leaf_pipeline: Pipeline,
+}
+pub struct DrawStage {
+    // Right now we're limited to partition^2 (~16M) elements. This can be
+    // expanded but is tedious.
+    root_buf: Buffer,
+    root_ds: DescriptorSet,
+}
+
+pub struct DrawBinding {
+    reduce_ds: DescriptorSet,
+    leaf_ds: DescriptorSet,
+}
+
+impl DrawCode {
+    pub unsafe fn new(session: &Session) -> DrawCode {
+        let reduce_code = include_shader!(session, "../../shader/gen/draw_reduce");
+        let reduce_pipeline = session
+            .create_compute_pipeline(
+                reduce_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::Buffer,
+                ],
+            )
+            .unwrap();
+        let root_code = include_shader!(session, "../../shader/gen/draw_root");
+        let root_pipeline = session
+            .create_compute_pipeline(root_code, &[BindType::Buffer])
+            .unwrap();
+        let leaf_code = include_shader!(session, "../../shader/gen/draw_leaf");
+        let leaf_pipeline = session
+            .create_compute_pipeline(
+                leaf_code,
+                &[
+                    BindType::Buffer,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                    BindType::BufReadOnly,
+                ],
+            )
+            .unwrap();
+        DrawCode {
+            reduce_pipeline,
+            root_pipeline,
+            leaf_pipeline,
+        }
+    }
+}
+
+impl DrawStage {
+    pub unsafe fn new(session: &Session, code: &DrawCode) -> DrawStage {
+        // We're limited to DRAW_PART_SIZE^2
+        // Also note: size here allows padding
+        let root_buf_size = DRAW_PART_SIZE * 8;
+        let root_buf = session
+            .create_buffer(root_buf_size, BufferUsage::STORAGE)
+            .unwrap();
+        let root_ds = session
+            .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf])
+            .unwrap();
+        DrawStage { root_buf, root_ds }
+    }
+
+    pub unsafe fn bind(
+        &self,
+        session: &Session,
+        code: &DrawCode,
+        config_buf: &Buffer,
+        scene_buf: &Buffer,
+        memory_buf: &Buffer,
+    ) -> DrawBinding {
+        let reduce_ds = session
+            .create_simple_descriptor_set(
+                &code.reduce_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        let leaf_ds = session
+            .create_simple_descriptor_set(
+                &code.leaf_pipeline,
+                &[memory_buf, config_buf, scene_buf, &self.root_buf],
+            )
+            .unwrap();
+        DrawBinding { reduce_ds, leaf_ds }
+    }
+
+    pub unsafe fn record(
+        &self,
+        cmd_buf: &mut CmdBuf,
+        code: &DrawCode,
+        binding: &DrawBinding,
+        size: u64,
+    ) {
+        if size > DRAW_PART_SIZE.pow(2) {
+            panic!("very large scan not yet implemented");
+        }
+        let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
+        if n_workgroups > 1 {
+            cmd_buf.dispatch(
+                &code.reduce_pipeline,
+                &binding.reduce_ds,
+                (n_workgroups as u32, 1, 1),
+                (DRAW_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+            cmd_buf.dispatch(
+                &code.root_pipeline,
+                &self.root_ds,
+                (1, 1, 1),
+                (DRAW_WG as u32, 1, 1),
+            );
+            cmd_buf.memory_barrier();
+        }
+        cmd_buf.dispatch(
+            &code.leaf_pipeline,
+            &binding.leaf_ds,
+            (n_workgroups as u32, 1, 1),
+            (DRAW_WG as u32, 1, 1),
+        );
+    }
+}
diff --git a/tests/src/draw.rs b/tests/src/draw.rs
new file mode 100644
index 0000000..ca19312
--- /dev/null
+++ b/tests/src/draw.rs
@@ -0,0 +1,147 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+//! Tests for the piet-gpu draw object stage.
+
+use piet_gpu_hal::{BufWrite, BufferUsage};
+use rand::Rng;
+
+use crate::{Config, Runner, TestResult};
+
+use piet_gpu::stages::{self, DrawCode, DrawMonoid, DrawStage};
+
+const ELEMENT_SIZE: usize = 36;
+
+const ELEMENT_FILLCOLOR: u32 = 4;
+const ELEMENT_FILLLINGRADIENT: u32 = 5;
+const ELEMENT_FILLIMAGE: u32 = 6;
+const ELEMENT_BEGINCLIP: u32 = 9;
+const ELEMENT_ENDCLIP: u32 = 10;
+
+struct DrawTestData {
+    tags: Vec<u32>,
+}
+
+pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
+    let mut result = TestResult::new("draw");
+    let n_tag: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24);
+    let data = DrawTestData::new(n_tag);
+    let stage_config = data.get_config();
+
+    let config_buf = runner
+        .session
+        .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
+        .unwrap();
+    let scene_size = n_tag * ELEMENT_SIZE as u64;
+    let scene_buf = runner
+        .session
+        .create_buffer_with(scene_size, |b| data.fill_scene(b), BufferUsage::STORAGE)
+        .unwrap();
+    let memory = runner.buf_down(data.memory_size(), BufferUsage::STORAGE);
+
+    let code = DrawCode::new(&runner.session);
+    let stage = DrawStage::new(&runner.session, &code);
+    let binding = stage.bind(
+        &runner.session,
+        &code,
+        &config_buf,
+        &scene_buf,
+        &memory.dev_buf,
+    );
+
+    let mut total_elapsed = 0.0;
+    let n_iter = config.n_iter;
+    for i in 0..n_iter {
+        let mut commands = runner.commands();
+        commands.write_timestamp(0);
+        stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
+        commands.write_timestamp(1);
+        if i == 0 || config.verify_all {
+            commands.cmd_buf.memory_barrier();
+            commands.download(&memory);
+        }
+        total_elapsed += runner.submit(commands);
+        if i == 0 || config.verify_all {
+            let dst = memory.map_read(..);
+            if let Some(failure) = data.verify(&dst) {
+                result.fail(failure);
+            }
+        }
+    }
+    let n_elements = n_tag;
+    result.timing(total_elapsed, n_elements * n_iter);
+
+    result
+}
+
+impl DrawTestData {
+    fn new(n: u64) -> DrawTestData {
+        let mut rng = rand::thread_rng();
+        let tags = (0..n).map(|_| rng.gen_range(0, 12)).collect();
+        DrawTestData { tags }
+    }
+
+    fn get_config(&self) -> stages::Config {
+        let n_tags = self.tags.len();
+
+        // Layout of memory
+        let drawmonoid_alloc = 0;
+        let stage_config = stages::Config {
+            n_elements: n_tags as u32,
+            drawmonoid_alloc,
+            ..Default::default()
+        };
+        stage_config
+    }
+
+    fn memory_size(&self) -> u64 {
+        8 + self.tags.len() as u64 * 8
+    }
+
+    fn fill_scene(&self, buf: &mut BufWrite) {
+        let mut element = [0u32; ELEMENT_SIZE / 4];
+        for tag in &self.tags {
+            element[0] = *tag;
+            buf.push(element);
+        }
+    }
+
+    fn verify(&self, buf: &[u8]) -> Option<String> {
+        let size = self.tags.len() * 8;
+        let actual = bytemuck::cast_slice::<u8, DrawMonoid>(&buf[8..8 + size]);
+        let mut expected = DrawMonoid::default();
+        for (i, (tag, actual)) in self.tags.iter().zip(actual).enumerate() {
+            // We compute an inclusive prefix sum, but for this application
+            // exclusive would be slightly better. We can adapt though.
+            let (path_ix, clip_ix) = Self::reduce_tag(*tag);
+            expected.path_ix += path_ix;
+            expected.clip_ix += clip_ix;
+            if *actual != expected {
+                return Some(format!("draw mismatch at {}", i));
+            }
+        }
+        None
+    }
+
+    fn reduce_tag(tag: u32) -> (u32, u32) {
+        match tag {
+            ELEMENT_FILLCOLOR | ELEMENT_FILLLINGRADIENT | ELEMENT_FILLIMAGE => (1, 0),
+            ELEMENT_BEGINCLIP => (1, 1),
+            ELEMENT_ENDCLIP => (0, 1),
+            _ => (0, 0),
+        }
+    }
+}
diff --git a/tests/src/main.rs b/tests/src/main.rs
index 9aab351..e52ce85 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -18,6 +18,7 @@
 
 mod clear;
 mod config;
+mod draw;
 mod linkedlist;
 mod message_passing;
 mod prefix;
@@ -137,6 +138,7 @@
         if config.groups.matches("piet") {
             report(&transform::transform_test(&mut runner, &config));
             report(&path::path_test(&mut runner, &config));
+            report(&draw::draw_test(&mut runner, &config));
         }
     }
 }