Add inner joins

Draw an inner join for almost all joins. The exception is when the join angle is tiny, in which case the two inner offset points are simply connected by a single line (if they differ at all). It shouldn't be too hard to add optimizations, especially when it's possible to determine that the join is entirely inside one side of the stroke.

The new behavior is behind `#ifdef inner_join`, which is enabled for the flatten shader in `src/shaders.rs`; the old code is not changed substantially.
diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl
index dede2f2..e44ac6f 100644
--- a/shader/flatten.wgsl
+++ b/shader/flatten.wgsl
@@ -421,7 +421,7 @@
             let normalized_offset = offset / cubic_params.chord_len;
             let dist_scaled = normalized_offset * es.params.ch;
 // NOTE: set this to "ifndef" to lower to arcs before flattening. Use ifdef to lower directly to lines.
-#ifndef arcs
+#ifdef arcs
             let arclen = length(es.p0 - es.p1) / es.params.ch;
             let est_err = (1. / 120.) / tol * abs(k1) * (arclen + 0.4 * abs(k1 * offset));
             let n_subdiv = cbrt(est_err);
@@ -616,10 +616,19 @@
 
     let cr = tan_prev.x * tan_next.y - tan_prev.y * tan_next.x;
     let d = dot(tan_prev, tan_next);
+#ifdef inner_join
+    let is_backside = cr > 0.;
+#endif
 
     switch style_flags & STYLE_FLAGS_JOIN_MASK {
         case STYLE_FLAGS_JOIN_BEVEL: {
+#ifdef inner_join
+            let p0 = select(front0, back0, is_backside);
+            let p1 = select(front1, back1, is_backside);
+            output_line_with_transform(path_ix, p0, p1, transform);
+#else
             output_two_lines_with_transform(path_ix, front0, front1, back0, back1, transform);
+#endif
         }
         case STYLE_FLAGS_JOIN_MITER: {
             let hypot = length(vec2f(cr, d));
@@ -627,7 +636,9 @@
 
             var line_ix: u32;
             if 2. * hypot < (hypot + d) * miter_limit * miter_limit && cr != 0. {
+#ifndef inner_join
                 let is_backside = cr > 0.;
+#endif
                 let fp_last = select(front0, back1, is_backside);
                 let fp_this = select(front1, back0, is_backside);
                 let p = select(front0, back0, is_backside);
@@ -636,7 +647,11 @@
                 let h = (tan_prev.x * v.y - tan_prev.y * v.x) / cr;
                 let miter_pt = fp_this - tan_next * h;
 
+#ifdef inner_join
+                line_ix = atomicAdd(&bump.lines, 2u);
+#else
                 line_ix = atomicAdd(&bump.lines, 3u);
+#endif
                 write_line_with_transform(line_ix, path_ix, p, miter_pt, transform);
                 line_ix += 1u;
 
@@ -646,10 +661,22 @@
                     front0 = miter_pt;
                 }
             } else {
+#ifdef inner_join
+                line_ix = atomicAdd(&bump.lines, 1u);
+#else
                 line_ix = atomicAdd(&bump.lines, 2u);
+#endif
             }
+#ifdef inner_join
+            if is_backside {
+                write_line_with_transform(line_ix, path_ix, back0, back1, transform);
+            } else {
+                write_line_with_transform(line_ix, path_ix, front0, front1, transform);
+            }
+#else
             write_line_with_transform(line_ix, path_ix, front0, front1, transform);
             write_line_with_transform(line_ix + 1u, path_ix, back0, back1, transform);
+#endif
         }
         case STYLE_FLAGS_JOIN_ROUND: {
             var arc0: vec2f;
@@ -668,10 +695,32 @@
                 other1 = back1;
             }
             flatten_arc(path_ix, arc0, arc1, p0, abs(atan2(cr, d)), transform);
+#ifndef inner_join
             output_line_with_transform(path_ix, other0, other1, transform);
+#endif
         }
         default: {}
     }
+#ifdef inner_join
+    // Handle inner join
+    if abs(cr) < 1e-6 {
+        // Smooth join: no inner-join geometry is needed; just connect the inner offset points if they differ.
+        let inner0 = select(back0, front0, is_backside);
+        let inner1 = select(back1, front1, is_backside);
+        if any(inner0 != inner1) {
+            output_line_with_transform(path_ix, inner0, inner1, transform);
+        }
+    } else {
+        let inner0 = select(back0, front0, is_backside);
+        let inner1 = select(back1, front1, is_backside);
+        let line_ix = atomicAdd(&bump.lines, 4u);
+        write_line_with_transform(line_ix, path_ix, inner0, p0, transform);
+        write_line_with_transform(line_ix + 1, path_ix, p0, inner1, transform);
+        write_line_with_transform(line_ix + 2, path_ix, inner0, p0, transform);
+        write_line_with_transform(line_ix + 3, path_ix, p0, inner1, transform);
+        flatten_arc(path_ix, inner1, inner0, p0, -abs(atan2(cr, d)), transform);
+    }
+#endif
 }
 
 fn read_f32_point(ix: u32) -> vec2f {
diff --git a/src/shaders.rs b/src/shaders.rs
index e00c1a9..d5df41f 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -95,6 +95,8 @@
     let mut small_config = HashSet::new();
     small_config.insert("full".into());
     small_config.insert("small".into());
+    let mut flatten_config = HashSet::new();
+    flatten_config.insert("inner_join".into());
 
     let mut force_gpu = false;
 
@@ -166,7 +168,8 @@
     let bbox_clear = add_shader!(bbox_clear, [Uniform, Buffer], &empty);
     let flatten = add_shader!(
         flatten,
-        [Uniform, BufReadOnly, BufReadOnly, Buffer, Buffer, Buffer]
+        [Uniform, BufReadOnly, BufReadOnly, Buffer, Buffer, Buffer],
+        &flatten_config
     );
     let draw_reduce = add_shader!(draw_reduce, [Uniform, BufReadOnly, Buffer], &empty);
     let draw_leaf = add_shader!(