piet-gpu/shader/path_coarse.comp - external/github.com/linebender/vello - Git at Google

 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

 // Coarse rasterization of path segments.

 // Allocation and initialization of tiles for paths.

 #version 450
 #extension GL_GOOGLE_include_directive : enable

 #include "mem.h"
 #include "setup.h"

 #define LG_COARSE_WG 5
 #define COARSE_WG (1 << LG_COARSE_WG)

 layout(local_size_x = COARSE_WG, local_size_y = 1) in;

 layout(set = 0, binding = 1) readonly buffer ConfigBuf {
     Config conf;
 };

 #include "pathseg.h"
 #include "tile.h"

 // scale factors useful for converting coordinates to tiles
 #define SX (1.0 / float(TILE_WIDTH_PX))
 #define SY (1.0 / float(TILE_HEIGHT_PX))

 #define ACCURACY 0.25
 #define Q_ACCURACY (ACCURACY * 0.1)
 #define REM_ACCURACY (ACCURACY - Q_ACCURACY)
 #define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
 #define MAX_QUADS 16

 vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
     float mt = 1.0 - t;
     return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
 }

 vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
     float mt = 1.0 - t;
     return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
 }

 struct SubdivResult {
     float val;
     float a0;
     float a2;
 };

 /// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
 ///
 /// This is used for flattening curves.
 #define D 0.67
 float approx_parabola_integral(float x) {
     return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
 }

 /// An approximation to the inverse parabola integral.
 #define B 0.39
 float approx_parabola_inv_integral(float x) {
     return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
 }

 SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
     vec2 d01 = p1 - p0;
     vec2 d12 = p2 - p1;
     vec2 dd = d01 - d12;
     float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
     float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
     float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
     float scale = abs(cross / (length(dd) * (x2 - x0)));

     float a0 = approx_parabola_integral(x0);
     float a2 = approx_parabola_integral(x2);
     float val = 0.0;
     if (scale < 1e9) {
         float da = abs(a2 - a0);
         float sqrt_scale = sqrt(scale);
         if (sign(x0) == sign(x2)) {
             val = da * sqrt_scale;
         } else {
             float xmin = sqrt_tol / sqrt_scale;
             val = sqrt_tol * da / approx_parabola_integral(xmin);
         }
     }
     return SubdivResult(val, a0, a2);
 }

 void main() {
     uint element_ix = gl_GlobalInvocationID.x;
     PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

     PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
     if (element_ix < conf.n_pathseg) {
         tag = PathSeg_tag(conf.pathseg_alloc, ref);
     }
     bool mem_ok = mem_error == NO_ERROR;
     switch (tag.tag) {
     case PathSeg_Cubic:
         PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);

         uint trans_ix = cubic.trans_ix;
         if (trans_ix > 0) {
             TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
             TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
             cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
             cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
             cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
             cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
         }

         vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
         float err = err_v.x * err_v.x + err_v.y * err_v.y;
         // The number of quadratics.
         uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
         n_quads = min(n_quads, MAX_QUADS);
         SubdivResult keep_params[MAX_QUADS];
         // Iterate over quadratics and tote up the estimated number of segments.
         float val = 0.0;
         vec2 qp0 = cubic.p0;
         float step = 1.0 / float(n_quads);
         for (uint i = 0; i < n_quads; i++) {
             float t = float(i + 1) * step;
             vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
             vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
             qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
             SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
             keep_params[i] = params;
             val += params.val;

             qp0 = qp2;
         }
         uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

         bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
         uint path_ix = cubic.path_ix;
         Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
         Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
         float v_step = val / float(n);
         int n_out = 1;
         float val_sum = 0.0;
         for (uint i = 0; i < n_quads; i++) {
             float t = float(i + 1) * step;
             vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
             vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
             qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
             SubdivResult params = keep_params[i];
             float u0 = approx_parabola_inv_integral(params.a0);
             float u2 = approx_parabola_inv_integral(params.a2);
             float uscale = 1.0 / (u2 - u0);
             float target = float(n_out) * v_step;
             while (n_out == n || target < val_sum + params.val) {
                 vec2 p1;
                 if (n_out == n) {
                     p1 = cubic.p3;
                 } else {
                     float u = (target - val_sum) / params.val;
                     float a = mix(params.a0, params.a2, u);
                     float au = approx_parabola_inv_integral(a);
                     float t = (au - u0) * uscale;
                     p1 = eval_quad(qp0, qp1, qp2, t);
                 }

                 // Output line segment

                 // Bounding box of element in pixel coordinates.
                 float xmin = min(p0.x, p1.x) - cubic.stroke.x;
                 float xmax = max(p0.x, p1.x) + cubic.stroke.x;
                 float ymin = min(p0.y, p1.y) - cubic.stroke.y;
                 float ymax = max(p0.y, p1.y) + cubic.stroke.y;
                 float dx = p1.x - p0.x;
                 float dy = p1.y - p0.y;
                 // Set up for per-scanline coverage formula, below.
                 float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
                 float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
                 float b = invslope; // Note: assumes square tiles, otherwise scale.
                 float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;

                 int x0 = int(floor(xmin * SX));
                 int x1 = int(floor(xmax * SX) + 1);
                 int y0 = int(floor(ymin * SY));
                 int y1 = int(floor(ymax * SY) + 1);

                 x0 = clamp(x0, bbox.x, bbox.z);
                 y0 = clamp(y0, bbox.y, bbox.w);
                 x1 = clamp(x1, bbox.x, bbox.z);
                 y1 = clamp(y1, bbox.y, bbox.w);
                 float xc = a + b * float(y0);
                 int stride = bbox.z - bbox.x;
                 int base = (y0 - bbox.y) * stride - bbox.x;
                 // TODO: can be tighter, use c to bound width
                 uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                 // Consider using subgroups to aggregate atomic add.
                 MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
                 if (tile_alloc.failed || !mem_ok) {
                     return;
                 }
                 uint tile_offset = tile_alloc.alloc.offset;

                 TileSeg tile_seg;

                 int xray = int(floor(p0.x*SX));
                 int last_xray = int(floor(p1.x*SX));
                 if (p0.y > p1.y) {
                     int tmp = xray;
                     xray = last_xray;
                     last_xray = tmp;
                 }
                 for (int y = y0; y < y1; y++) {
                     float tile_y0 = float(y * TILE_HEIGHT_PX);
                     int xbackdrop = max(xray + 1, bbox.x);
                     if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
                         int backdrop = p1.y < p0.y ? 1 : -1;
                         TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                         uint tile_el = tile_ref.offset >> 2;
                         if (touch_mem(path_alloc, tile_el + 1)) {
                             atomicAdd(memory[tile_el + 1], backdrop);
                         }
                     }

                     // next_xray is the xray for the next scanline; the line segment intersects
                     // all tiles between xray and next_xray.
                     int next_xray = last_xray;
                     if (y < y1 - 1) {
                         float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
                         float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
                         next_xray = int(floor(x_edge*SX));
                     }

                     int min_xray = min(xray, next_xray);
                     int max_xray = max(xray, next_xray);
                     int xx0 = min(int(floor(xc - c)), min_xray);
                     int xx1 = max(int(ceil(xc + c)), max_xray + 1);
                     xx0 = clamp(xx0, x0, x1);
                     xx1 = clamp(xx1, x0, x1);

                     for (int x = xx0; x < xx1; x++) {
                         float tile_x0 = float(x * TILE_WIDTH_PX);
                         TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                         uint tile_el = tile_ref.offset >> 2;
                         uint old = 0;
                         if (touch_mem(path_alloc, tile_el)) {
                             old = atomicExchange(memory[tile_el], tile_offset);
                         }
                         tile_seg.origin = p0;
                         tile_seg.vector = p1 - p0;
                         float y_edge = 0.0;
                         if (!is_stroke) {
                             y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                             if (min(p0.x, p1.x) < tile_x0) {
                                 vec2 p = vec2(tile_x0, y_edge);
                                 if (p0.x > p1.x) {
                                     tile_seg.vector = p - p0;
                                 } else {
                                     tile_seg.origin = p;
                                     tile_seg.vector = p1 - p;
                                 }
                                 // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
                                 // Nudge zeroes towards the intended sign.
                                 if (tile_seg.vector.x == 0) {
                                     tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
                                 }
                             }
                             if (x <= min_xray || max_xray < x) {
                                 // Reject inconsistent intersections.
                                 y_edge = 1e9;
                             }
                         }
                         tile_seg.y_edge = y_edge;
                         tile_seg.next.offset = old;
                         TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
                         tile_offset += TileSeg_size;
                     }
                     xc += b;
                     base += stride;
                     xray = next_xray;
                 }

                 n_out += 1;
                 target += v_step;
                 p0 = p1;
             }
             val_sum += params.val;

             qp0 = qp2;
         }

         break;
     }
 }
	// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

	// Coarse rasterization of path segments.

	// Allocation and initialization of tiles for paths.

	#version 450
	#extension GL_GOOGLE_include_directive : enable

	#include "mem.h"
	#include "setup.h"

	#define LG_COARSE_WG 5
	#define COARSE_WG (1 << LG_COARSE_WG)

	layout(local_size_x = COARSE_WG, local_size_y = 1) in;

	layout(set = 0, binding = 1) readonly buffer ConfigBuf {
	Config conf;
	};

	#include "pathseg.h"
	#include "tile.h"

	// scale factors useful for converting coordinates to tiles
	#define SX (1.0 / float(TILE_WIDTH_PX))
	#define SY (1.0 / float(TILE_HEIGHT_PX))

	#define ACCURACY 0.25
	#define Q_ACCURACY (ACCURACY * 0.1)
	#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
	#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
	#define MAX_QUADS 16

	vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
	float mt = 1.0 - t;
	return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
	}

	vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
	float mt = 1.0 - t;
	return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
	}

	struct SubdivResult {
	float val;
	float a0;
	float a2;
	};

	/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
	///
	/// This is used for flattening curves.
	#define D 0.67
	float approx_parabola_integral(float x) {
	return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
	}

	/// An approximation to the inverse parabola integral.
	#define B 0.39
	float approx_parabola_inv_integral(float x) {
	return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
	}

	SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
	vec2 d01 = p1 - p0;
	vec2 d12 = p2 - p1;
	vec2 dd = d01 - d12;
	float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
	float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
	float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
	float scale = abs(cross / (length(dd) * (x2 - x0)));

	float a0 = approx_parabola_integral(x0);
	float a2 = approx_parabola_integral(x2);
	float val = 0.0;
	if (scale < 1e9) {
	float da = abs(a2 - a0);
	float sqrt_scale = sqrt(scale);
	if (sign(x0) == sign(x2)) {
	val = da * sqrt_scale;
	} else {
	float xmin = sqrt_tol / sqrt_scale;
	val = sqrt_tol * da / approx_parabola_integral(xmin);
	}
	}
	return SubdivResult(val, a0, a2);
	}

	void main() {
	uint element_ix = gl_GlobalInvocationID.x;
	PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

	PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
	if (element_ix < conf.n_pathseg) {
	tag = PathSeg_tag(conf.pathseg_alloc, ref);
	}
	bool mem_ok = mem_error == NO_ERROR;
	switch (tag.tag) {
	case PathSeg_Cubic:
	PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);

	uint trans_ix = cubic.trans_ix;
	if (trans_ix > 0) {
	TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
	TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
	cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
	cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
	cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
	cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
	}

	vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
	float err = err_v.x * err_v.x + err_v.y * err_v.y;
	// The number of quadratics.
	uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
	n_quads = min(n_quads, MAX_QUADS);
	SubdivResult keep_params[MAX_QUADS];
	// Iterate over quadratics and tote up the estimated number of segments.
	float val = 0.0;
	vec2 qp0 = cubic.p0;
	float step = 1.0 / float(n_quads);
	for (uint i = 0; i < n_quads; i++) {
	float t = float(i + 1) * step;
	vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
	vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
	qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
	SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
	keep_params[i] = params;
	val += params.val;

	qp0 = qp2;
	}
	uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);

	bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
	uint path_ix = cubic.path_ix;
	Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
	Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
	ivec4 bbox = ivec4(path.bbox);
	vec2 p0 = cubic.p0;
	qp0 = cubic.p0;
	float v_step = val / float(n);
	int n_out = 1;
	float val_sum = 0.0;
	for (uint i = 0; i < n_quads; i++) {
	float t = float(i + 1) * step;
	vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
	vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
	qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
	SubdivResult params = keep_params[i];
	float u0 = approx_parabola_inv_integral(params.a0);
	float u2 = approx_parabola_inv_integral(params.a2);
	float uscale = 1.0 / (u2 - u0);
	float target = float(n_out) * v_step;
	while (n_out == n \|\| target < val_sum + params.val) {
	vec2 p1;
	if (n_out == n) {
	p1 = cubic.p3;
	} else {
	float u = (target - val_sum) / params.val;
	float a = mix(params.a0, params.a2, u);
	float au = approx_parabola_inv_integral(a);
	float t = (au - u0) * uscale;
	p1 = eval_quad(qp0, qp1, qp2, t);
	}

	// Output line segment

	// Bounding box of element in pixel coordinates.
	float xmin = min(p0.x, p1.x) - cubic.stroke.x;
	float xmax = max(p0.x, p1.x) + cubic.stroke.x;
	float ymin = min(p0.y, p1.y) - cubic.stroke.y;
	float ymax = max(p0.y, p1.y) + cubic.stroke.y;
	float dx = p1.x - p0.x;
	float dy = p1.y - p0.y;
	// Set up for per-scanline coverage formula, below.
	float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
	float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
	float b = invslope; // Note: assumes square tiles, otherwise scale.
	float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;

	int x0 = int(floor(xmin * SX));
	int x1 = int(floor(xmax * SX) + 1);
	int y0 = int(floor(ymin * SY));
	int y1 = int(floor(ymax * SY) + 1);

	x0 = clamp(x0, bbox.x, bbox.z);
	y0 = clamp(y0, bbox.y, bbox.w);
	x1 = clamp(x1, bbox.x, bbox.z);
	y1 = clamp(y1, bbox.y, bbox.w);
	float xc = a + b * float(y0);
	int stride = bbox.z - bbox.x;
	int base = (y0 - bbox.y) * stride - bbox.x;
	// TODO: can be tighter, use c to bound width
	uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
	// Consider using subgroups to aggregate atomic add.
	MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
	if (tile_alloc.failed \|\| !mem_ok) {
	return;
	}
	uint tile_offset = tile_alloc.alloc.offset;

	TileSeg tile_seg;

	int xray = int(floor(p0.x*SX));
	int last_xray = int(floor(p1.x*SX));
	if (p0.y > p1.y) {
	int tmp = xray;
	xray = last_xray;
	last_xray = tmp;
	}
	for (int y = y0; y < y1; y++) {
	float tile_y0 = float(y * TILE_HEIGHT_PX);
	int xbackdrop = max(xray + 1, bbox.x);
	if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
	int backdrop = p1.y < p0.y ? 1 : -1;
	TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
	uint tile_el = tile_ref.offset >> 2;
	if (touch_mem(path_alloc, tile_el + 1)) {
	atomicAdd(memory[tile_el + 1], backdrop);
	}
	}

	// next_xray is the xray for the next scanline; the line segment intersects
	// all tiles between xray and next_xray.
	int next_xray = last_xray;
	if (y < y1 - 1) {
	float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
	float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
	next_xray = int(floor(x_edge*SX));
	}

	int min_xray = min(xray, next_xray);
	int max_xray = max(xray, next_xray);
	int xx0 = min(int(floor(xc - c)), min_xray);
	int xx1 = max(int(ceil(xc + c)), max_xray + 1);
	xx0 = clamp(xx0, x0, x1);
	xx1 = clamp(xx1, x0, x1);

	for (int x = xx0; x < xx1; x++) {
	float tile_x0 = float(x * TILE_WIDTH_PX);
	TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
	uint tile_el = tile_ref.offset >> 2;
	uint old = 0;
	if (touch_mem(path_alloc, tile_el)) {
	old = atomicExchange(memory[tile_el], tile_offset);
	}
	tile_seg.origin = p0;
	tile_seg.vector = p1 - p0;
	float y_edge = 0.0;
	if (!is_stroke) {
	y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
	if (min(p0.x, p1.x) < tile_x0) {
	vec2 p = vec2(tile_x0, y_edge);
	if (p0.x > p1.x) {
	tile_seg.vector = p - p0;
	} else {
	tile_seg.origin = p;
	tile_seg.vector = p1 - p;
	}
	// kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
	// Nudge zeroes towards the intended sign.
	if (tile_seg.vector.x == 0) {
	tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
	}
	}
	if (x <= min_xray \|\| max_xray < x) {
	// Reject inconsistent intersections.
	y_edge = 1e9;
	}
	}
	tile_seg.y_edge = y_edge;
	tile_seg.next.offset = old;
	TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
	tile_offset += TileSeg_size;
	}
	xc += b;
	base += stride;
	xray = next_xray;
	}

	n_out += 1;
	target += v_step;
	p0 = p1;
	}
	val_sum += params.val;

	qp0 = qp2;
	}

	break;
	}
	}