Improve numerical robustness of path filling

Handle various edge cases according to epsilon theory. Also add test SVG file which is useful for hitting those cases.

While this handles all known artifacts, there are some rough edges. For one, area antialiasing still exhibits some artifacts. Another issue is that the bounding box culling is now slightly more lenient on the Rust side than the WGSL one (edges aligned to the left edge of the tile are now included). The WGSL path hasn't been exercised as carefully as the Rust one.
diff --git a/examples/assets/robustness.svg b/examples/assets/robustness.svg
new file mode 100644
index 0000000..2ba1ee6
--- /dev/null
+++ b/examples/assets/robustness.svg
@@ -0,0 +1,8 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1600 1600">
+  <path d="M16 16 h16 v16 h-16" fill="#0f0"/>
+
+  <path d="M48 16 h16 v16" fill="#0f0"/>
+  <path d="M80 16 l16 16 h-16" fill="#08f"/>
+
+  <path d="M116 16 h12 v16 h-8" fill="#0f0"/>
+</svg>
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index f5b6b91..95f5aeb 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -234,14 +234,31 @@
             let segment = segments[seg_off];
             // Note: coords relative to tile origin probably a good idea in coarse path,
             // especially as f16 would work. But keeping existing scheme for compatibility.
-            let xy0_in = segment.origin - tile_origin;
-            let xy1_in = xy0_in + segment.delta;
-            let is_down = xy1_in.y >= xy0_in.y;
-            let xy0 = select(xy1_in, xy0_in, is_down);
-            let xy1 = select(xy0_in, xy1_in, is_down);
-            count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
-            let delta = select(-1, 1, xy1_in.x <= xy0_in.x);
-            let y_edge = u32(ceil(segment.y_edge - tile_origin.y));
+            let xy0 = segment.origin - tile_origin;
+            let xy1 = xy0 + segment.delta;
+            var y_edge_f = f32(TILE_HEIGHT);
+            var delta = select(-1, 1, xy1.x <= xy0.x);
+            if xy0.x == 0.0 && xy1.x == 0.0 {
+                if xy0.y == 0.0 {
+                    y_edge_f = 0.0;
+                } else if xy1.y == 0.0 {
+                    y_edge_f = 0.0;
+                    delta = -delta;
+                }
+            } else {
+                if xy0.x == 0.0 {
+                    if xy0.y != 0.0 {
+                        y_edge_f = xy0.y;
+                    }
+                } else if xy1.x == 0.0 && xy1.y != 0.0 {
+                    y_edge_f = xy1.y;
+                }
+                // discard horizontal lines aligned to pixel grid
+                if !(xy0.y == xy1.y && xy0.y == floor(xy0.y)) {
+                    count = span(xy0.x, xy1.x) + span(xy0.y, xy1.y) - 1u;
+                }
+            }
+            let y_edge = u32(ceil(y_edge_f));
             if y_edge < TILE_HEIGHT {
                 atomicAdd(&sh_winding_y[y_edge >> 3u], u32(delta) << ((y_edge & 7u) << 2u));
             }
@@ -292,8 +309,8 @@
             // One alternative is to compute it in a separate dispatch.
             let dx = abs(xy1.x - xy0.x);
             let dy = xy1.y - xy0.y;
-            let idxdy = 1.0 / (dx + dy);
-            let a = dx * idxdy;
+            let dy_dxdy = dy / (dx + dy);
+            let a = dx / (dx + dy);
             let is_positive_slope = xy1.x >= xy0.x;
             let sign = select(-1.0, 1.0, is_positive_slope);
             let xt0 = floor(xy0.x * sign);
@@ -301,7 +318,7 @@
             // This has a special case in the JS code, but we should just not render
             let y0i = floor(xy0.y);
             let ytop = select(y0i + 1.0, ceil(xy0.y), xy0.y == xy1.y);
-            let b = (dy * c + dx * (ytop - xy0.y)) * idxdy;
+            let b = dy_dxdy * c + a * (ytop - xy0.y);
             let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
             // Use line equation to plot pixel coordinates
 
@@ -331,7 +348,8 @@
             }
             // Apply sample mask
             let mask_block = u32(is_positive_slope) * (MASK_WIDTH * MASK_HEIGHT / 2u);
-            let mask_row = floor(a * f32(MASK_HEIGHT / 2u)) * f32(MASK_WIDTH);
+            let half_height = f32(MASK_HEIGHT / 2u);
+            let mask_row = floor(min(a * half_height, half_height - 1.0)) * f32(MASK_WIDTH);
             let mask_col = floor((zf - z) * f32(MASK_WIDTH));
             let mask_ix = mask_block + u32(mask_row + mask_col);
 #ifdef msaa8
@@ -485,8 +503,8 @@
                 let fill = read_fill(cmd_ix);
                 let n_segs = fill.size_and_rule >> 1u;
                 let even_odd = (fill.size_and_rule & 1u) != 0u;
-                area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd);
-                //area = fill_path_ms(fill.seg_data, n_segs, fill.backdrop, wg_id.xy, local_id.xy, even_odd);
+                //area = fill_path(fill.seg_data, n_segs, fill.backdrop, xy, even_odd);
+                area = fill_path_ms(fill.seg_data, n_segs, fill.backdrop, wg_id.xy, local_id.xy, even_odd);
                 cmd_ix += 4u;
             }
             // CMD_STROKE
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
index 0ddc4ce..3cf4e4c 100644
--- a/shader/path_count.wgsl
+++ b/shader/path_count.wgsl
@@ -68,6 +68,9 @@
             // Zero-length segment, drop it. Note, this should be culled earlier.
             return;
         }
+        if dy == 0.0 && floor(s0.y) == s0.y {
+            return;
+        }
         let idxdy = 1.0 / (dx + dy);
         let a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs
index c48cc0c..670cbf0 100644
--- a/src/cpu_shader/path_count.rs
+++ b/src/cpu_shader/path_count.rs
@@ -31,6 +31,9 @@
         if dx + dy == 0.0 {
             continue;
         }
+        if dy == 0.0 && s0.y.floor() == s0.y {
+            continue;
+        }
         let dy_dxdy = dy / (dx + dy);
         let a = 1.0 - dy_dxdy;
         let is_positive_slope = s1.x >= s0.x;
@@ -52,7 +55,7 @@
         ];
         let xmin = s0.x.min(s1.x);
         let stride = bbox[2] - bbox[0];
-        if s0.y >= bbox[3] as f32 || s1.y <= bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0
+        if s0.y >= bbox[3] as f32 || s1.y < bbox[1] as f32 || xmin >= bbox[2] as f32 || stride == 0
         {
             continue;
         }
@@ -76,7 +79,7 @@
         let delta = if is_down { -1 } else { 1 };
         let mut ymin = 0;
         let mut ymax = 0;
-        if s0.x.max(s1.x) <= bbox[0] as f32 {
+        if s0.x.max(s1.x) < bbox[0] as f32 {
             ymin = s0.y.ceil() as i32;
             ymax = s1.y.ceil() as i32;
             imax = imin;
diff --git a/src/shaders.rs b/src/shaders.rs
index 95065a7..0d21501 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -305,7 +305,7 @@
         preprocess::preprocess(shader!("path_tiling_setup"), &empty, &imports).into(),
         &[BindType::BufReadOnly, BindType::Buffer],
     )?;
-    //engine.set_cpu_shader(path_tiling_setup, cpu_shader::path_tiling_setup);
+    engine.set_cpu_shader(path_tiling_setup, cpu_shader::path_tiling_setup);
     let path_tiling = engine.add_shader(
         device,
         "path_tiling",
@@ -319,7 +319,7 @@
             BindType::Buffer,
         ],
     )?;
-    //engine.set_cpu_shader(path_tiling, cpu_shader::path_tiling);
+    engine.set_cpu_shader(path_tiling, cpu_shader::path_tiling);
     let fine = engine.add_shader(
         device,
         "fine",