Improve numerical robustness of conservative line rasterization

Apply additional logic to make sure that first and last tiles of conservative line rasterization land on the right tile.

Note: this fixes the assert and should avoid artifacts, but still doesn't render the poland svg correctly (there are black splotches).

Fixes #376 and #378
diff --git a/shader/fine.wgsl b/shader/fine.wgsl
index f41747d..5d500ad 100644
--- a/shader/fine.wgsl
+++ b/shader/fine.wgsl
@@ -77,6 +77,10 @@
 
 let SEG_SIZE = 5u;
 
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
 // New multisampled algorithm.
 fn fill_path_ms(fill: CmdFill, wg_id: vec2<u32>, local_id: vec2<u32>) -> array<f32, PIXELS_PER_THREAD> {
     let n_segs = fill.size_and_rule >> 1u;
@@ -175,16 +179,21 @@
             // One alternative is to compute it in a separate dispatch.
             let dx = abs(xy1.x - xy0.x);
             let dy = xy1.y - xy0.y;
-            // TODO: apply numerical robustness and optimization
-            let dy_dxdy = dy / (dx + dy);
-            let a = dx / (dx + dy);
+            let idxdy = 1.0 / (dx + dy);
+            var a = dx * idxdy;
             let is_positive_slope = xy1.x >= xy0.x;
             let sign = select(-1.0, 1.0, is_positive_slope);
             let xt0 = floor(xy0.x * sign);
             let c = xy0.x * sign - xt0;
             let y0i = floor(xy0.y);
             let ytop = y0i + 1.0;
-            let b = dy_dxdy * c + a * (ytop - xy0.y);
+            let b = min((dy * c + dx * (ytop - xy0.y)) * idxdy, ONE_MINUS_ULP);
+            let count_x = span(xy0.x, xy1.x) - 1u;
+            let count = count_x + span(xy0.y, xy1.y);
+            let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+            if robust_err != 0.0 {
+                a -= ROBUST_EPSILON * sign(robust_err);
+            }
             let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
             // Use line equation to plot pixel coordinates
 
diff --git a/shader/path_count.wgsl b/shader/path_count.wgsl
index e34ff7a..dabb6d1 100644
--- a/shader/path_count.wgsl
+++ b/shader/path_count.wgsl
@@ -36,6 +36,10 @@
     return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
 }
 
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
 // Note regarding clipping to bounding box:
 //
 // We have to do the backdrop bumps for all tiles to the left of the bbox.
@@ -57,7 +61,8 @@
         let xy1 = select(line.p0, line.p1, is_down);
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let count_x = span(s0.x, s1.x) - 1u;
+        count = count_x + span(s0.y, s1.y);
         let line_ix = global_id.x;
 
         let dx = abs(s1.x - s0.x);
@@ -72,14 +77,18 @@
             return;
         }
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        var a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = select(-1.0, 1.0, is_positive_slope);
         let xt0 = floor(s0.x * sign);
         let c = s0.x * sign - xt0;
         let y0 = floor(s0.y);
         let ytop = select(y0 + 1.0, ceil(s0.y), s0.y == s1.y);
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = min((dy * c + dx * (ytop - s0.y)) * idxdy, ONE_MINUS_ULP);
+        let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON * sign(robust_err);
+        }
         let x0 = xt0 * sign + select(-1.0, 0.0, is_positive_slope);
 
         let path = paths[line.path_ix];
diff --git a/shader/path_tiling.wgsl b/shader/path_tiling.wgsl
index ace01ab..fad5d72 100644
--- a/shader/path_tiling.wgsl
+++ b/shader/path_tiling.wgsl
@@ -29,6 +29,10 @@
     return u32(max(ceil(max(a, b)) - floor(min(a, b)), 1.0));
 }
 
+// See cpu_shaders/util.rs for explanation of these.
+let ONE_MINUS_ULP: f32 = 0.99999994;
+let ROBUST_EPSILON: f32 = 2e-7;
+
 // One invocation for each tile that is to be written.
 // Total number of invocations = bump.seg_counts
 @compute @workgroup_size(256)
@@ -49,20 +53,25 @@
         var xy1 = select(line.p0, line.p1, is_down);
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1u;
+        let count_x = span(s0.x, s1.x) - 1u;
+        let count = count_x + span(s0.y, s1.y);
         let dx = abs(s1.x - s0.x);
         let dy = s1.y - s0.y;
         // Division by zero can't happen because zero-length lines
         // have already been discarded in the path_count stage.
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        var a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = select(-1.0, 1.0, is_positive_slope);
         let xt0 = floor(s0.x * sign);
         let c = s0.x * sign - xt0;
         let y0i = floor(s0.y);
         let ytop = select(y0i + 1.0, ceil(s0.y), s0.y == s1.y);
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = min((dy * c + dx * (ytop - s0.y)) * idxdy, ONE_MINUS_ULP);
+        let robust_err = floor(a * (f32(count) - 1.0) + b) - f32(count_x);
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON * sign(robust_err);
+        }
         let x0i = i32(xt0 * sign + 0.5 * (sign - 1.0));
         let z = floor(a * f32(seg_within_line) + b);
         let x = x0i + i32(sign * z);
diff --git a/src/cpu_shader/path_count.rs b/src/cpu_shader/path_count.rs
index 2cee5b8..3cb8855 100644
--- a/src/cpu_shader/path_count.rs
+++ b/src/cpu_shader/path_count.rs
@@ -5,7 +5,7 @@
 
 use crate::cpu_dispatch::CpuBinding;
 
-use super::util::{span, Vec2};
+use super::util::{span, Vec2, ONE_MINUS_ULP, ROBUST_EPSILON};
 
 const TILE_SCALE: f32 = 1.0 / 16.0;
 
@@ -24,7 +24,8 @@
         let (xy0, xy1) = if is_down { (p0, p1) } else { (p1, p0) };
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1;
+        let count_x = span(s0.x, s1.x) - 1;
+        let count = count_x + span(s0.y, s1.y);
 
         let dx = (s1.x - s0.x).abs();
         let dy = s1.y - s0.y;
@@ -35,14 +36,18 @@
             continue;
         }
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        let mut a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = if is_positive_slope { 1.0 } else { -1.0 };
         let xt0 = (s0.x * sign).floor();
         let c = s0.x * sign - xt0;
         let y0 = s0.y.floor();
         let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 };
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = ((dy * c + dx * (ytop - s0.y)) * idxdy).min(ONE_MINUS_ULP);
+        let robust_err = (a * (count as f32 - 1.0) + b).floor() - count_x as f32;
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON.copysign(robust_err);
+        }
         let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 };
 
         let path = paths[line.path_ix as usize];
diff --git a/src/cpu_shader/path_tiling.rs b/src/cpu_shader/path_tiling.rs
index 41549bb..56bc2b4 100644
--- a/src/cpu_shader/path_tiling.rs
+++ b/src/cpu_shader/path_tiling.rs
@@ -3,7 +3,10 @@
 
 use vello_encoding::{BumpAllocators, LineSoup, Path, PathSegment, SegmentCount, Tile};
 
-use crate::cpu_dispatch::CpuBinding;
+use crate::{
+    cpu_dispatch::CpuBinding,
+    cpu_shader::util::{ONE_MINUS_ULP, ROBUST_EPSILON},
+};
 
 use super::util::{span, Vec2};
 
@@ -33,19 +36,24 @@
         let (mut xy0, mut xy1) = if is_down { (p0, p1) } else { (p1, p0) };
         let s0 = xy0 * TILE_SCALE;
         let s1 = xy1 * TILE_SCALE;
-        let count = span(s0.x, s1.x) + span(s0.y, s1.y) - 1;
+        let count_x = span(s0.x, s1.x) - 1;
+        let count = count_x + span(s0.y, s1.y);
 
         let dx = (s1.x - s0.x).abs();
         let dy = s1.y - s0.y;
         let idxdy = 1.0 / (dx + dy);
-        let a = dx * idxdy;
+        let mut a = dx * idxdy;
         let is_positive_slope = s1.x >= s0.x;
         let sign = if is_positive_slope { 1.0 } else { -1.0 };
         let xt0 = (s0.x * sign).floor();
         let c = s0.x * sign - xt0;
         let y0 = s0.y.floor();
         let ytop = if s0.y == s1.y { s0.y.ceil() } else { y0 + 1.0 };
-        let b = (dy * c + dx * (ytop - s0.y)) * idxdy;
+        let b = ((dy * c + dx * (ytop - s0.y)) * idxdy).min(ONE_MINUS_ULP);
+        let robust_err = (a * (count as f32 - 1.0) + b).floor() - count_x as f32;
+        if robust_err != 0.0 {
+            a -= ROBUST_EPSILON.copysign(robust_err);
+        }
         let x0 = xt0 * sign + if is_positive_slope { 0.0 } else { -1.0 };
         let z = (a * seg_within_line as f32 + b).floor();
         let x = x0 as i32 + (sign * z) as i32;
diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs
index 2bb3279..eae6f8b 100644
--- a/src/cpu_shader/util.rs
+++ b/src/cpu_shader/util.rs
@@ -111,3 +111,18 @@
         DRAWTAG_NOP
     }
 }
+
+/// The largest floating point value strictly less than 1.
+///
+/// This value is used to limit the value of b so that its floor is strictly less
+/// than 1. That guarantees that floor(a * i + b) == 0 for i == 0, which lands on
+/// the correct first tile.
+pub const ONE_MINUS_ULP: f32 = 0.99999994;
+
+/// An epsilon to be applied in path numerical robustness.
+///
+/// When floor(a * (n - 1) + b) does not match the expected value (the width in
+/// grid cells minus one), this delta is applied to a to push it in the correct
+/// direction. The theory is that a is not off by more than a few ulp, and it's
+/// always in the range of 0..1.
+pub const ROBUST_EPSILON: f32 = 2e-7;